%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "2.25", %%% date = "27 September 2023", %%% time = "17:02:58 MDT", %%% filename = "sigplan2010.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "https://www.math.utah.edu/~beebe", %%% checksum = "08201 139718 750618 7192531", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "bibliography, BibTeX, programming languages, %%% SIGPLAN", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE bibliography of ACM SIGPLAN %%% Notices, covering 2010--2019. %%% %%% There are World Wide Web sites for the %%% journal at %%% %%% http://www.acm.org/sigplan/ %%% https://dl.acm.org/loi/sigplan %%% %%% and %%% %%% http://www.rowan.edu/sigplan/ %%% %%% and coverage of about a dozen volumes can be found at %%% %%% http://ftp.informatik.rwth-aachen.de/dblp/db/journals/sigplan/index.html %%% %%% Several conference proceedings are published %%% as volumes of SIGPLAN Notices. Many of them %%% can also be found via the ACM proceedings Web %%% sites: %%% %%% http://www.acm.org/pubs/contents/proceedings/ %%% http://www.acm.org/pubs/contents/proceedings/asplos/ %%% http://www.acm.org/pubs/contents/proceedings/plan/ %%% http://www.acm.org/pubs/contents/proceedings/pldi/ %%% %%% At version 2.25, the COMPLETE year coverage %%% looks like this: %%% %%% 2010 ( 355) 2013 ( 377) 2016 ( 378) %%% 2011 ( 370) 2014 ( 354) 2017 ( 343) %%% 2012 ( 375) 2015 ( 389) 2018 ( 247) %%% %%% Article: 3188 %%% %%% Total entries: 3188 %%% %%% Some of the bibliography entries in this %%% file contain abstracts. These are governed %%% by the ACM Copyright Notice for ACM SIGPLAN %%% Notices, which says: %%% %%% ``Permission to copy without fee all %%% or part of this material is granted %%% provided that the copies are not made %%% or distributed for commercial %%% advantage, the ACM copyright notice %%% and the title of the publication and %%% its date appear, and notice is given %%% that copying is by permission of the %%% Association for Computing Machinery. %%% To copy otherwise, or to republish, %%% requires a fee and/or specific %%% permission.'' %%% %%% Inasmuch as this bibliography, and its %%% companion files in the master collection, %%% is freely distributed without charge, %%% inclusion of article abstracts clearly %%% falls within the copyright permissions, and %%% this author considers that ACM has given %%% the required permission under the terms of %%% the above Copyright Notice. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% In this bibliography, entries are sorted in %%% publication order, using bibsort -byvolume. %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. 
This is produced by Robert %%% Solovay's checksum utility.", %%% } %%% ==================================================================== @Preamble{ "\input bibnames.sty " # "\input path.sty " # "\def \TM {${}^{\sc TM}$} " # "\ifx \undefined \circled \def \circled #1{(#1)} \fi" # "\ifx \undefined \reg \def \reg {\circled{R}} \fi" # "\hyphenation{ }" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|https://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Journal abbreviations: @String{j-SIGPLAN = "ACM SIG{\-}PLAN Notices"} %%% ==================================================================== %%% Publisher abbreviations: @String{pub-ACM = "ACM Press"} @String{pub-ACM:adr = "New York, NY, USA"} @String{pub-AW = "Ad{\-d}i{\-s}on-Wes{\-l}ey"} @String{pub-AW:adr = "Reading, MA, USA"} %%% ==================================================================== %%% Series abbreviations: @String{ser-SIGPLAN = "ACM SIG{\-}PLAN Notices"} %%% ==================================================================== %%% Bibliography entries, in publication order: @Article{Gershenfeld:2010:RAL, author = "Neil Gershenfeld and David Dalrymple and Kailiang Chen and Ara Knaian and Forrest Green and Erik D. Demaine and Scott Greenwald and Peter Schmidt-Nielsen", title = "Reconfigurable asynchronous logic automata: {(RALA)}", journal = j-SIGPLAN, volume = "45", number = "1", pages = "1--6", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Atig:2010:VPW, author = "Mohamed Faouzi Atig and Ahmed Bouajjani and Sebastian Burckhardt and Madanlal Musuvathi", title = "On the verification problem for weak memory models", journal = j-SIGPLAN, volume = "45", number = "1", pages = "7--18", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Koskinen:2010:CGT, author = "Eric Koskinen and Matthew Parkinson and Maurice Herlihy", title = "Coarse-grained transactions", journal = j-SIGPLAN, volume = "45", number = "1", pages = "19--30", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Attiya:2010:SVS, author = "H. Attiya and G. Ramalingam and N. 
Rinetzky", title = "Sequential verification of serializability", journal = j-SIGPLAN, volume = "45", number = "1", pages = "31--42", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Godefroid:2010:CMM, author = "Patrice Godefroid and Aditya V. Nori and Sriram K. Rajamani and Sai Deep Tetali", title = "Compositional may-must program analysis: unleashing the power of alternation", journal = j-SIGPLAN, volume = "45", number = "1", pages = "43--56", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chaudhuri:2010:CAP, author = "Swarat Chaudhuri and Sumit Gulwani and Roberto Lublinerman", title = "Continuity analysis of programs", journal = j-SIGPLAN, volume = "45", number = "1", pages = "57--70", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Harris:2010:PAS, author = "William R. 
Harris and Sriram Sankaranarayanan and Franjo Ivan{\v{c}}i{\'c} and Aarti Gupta", title = "Program analysis via satisfiability modulo path programs", journal = j-SIGPLAN, volume = "45", number = "1", pages = "71--82", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tristan:2010:SVV, author = "Jean-Baptiste Tristan and Xavier Leroy", title = "A simple, verified validator for software pipelining", journal = j-SIGPLAN, volume = "45", number = "1", pages = "83--92", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chlipala:2010:VCI, author = "Adam Chlipala", title = "A verified compiler for an impure functional language", journal = j-SIGPLAN, volume = "45", number = "1", pages = "93--106", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Myreen:2010:VJT, author = "Magnus O. 
Myreen", title = "Verified just-in-time compiler on {x86}", journal = j-SIGPLAN, volume = "45", number = "1", pages = "107--118", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Terauchi:2010:DTC, author = "Tachio Terauchi", title = "Dependent types from counterexamples", journal = j-SIGPLAN, volume = "45", number = "1", pages = "119--130", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Rondon:2010:LLL, author = "Patrick Maxim Rondon and Ming Kawaguchi and Ranjit Jhala", title = "Low-level liquid types", journal = j-SIGPLAN, volume = "45", number = "1", pages = "131--144", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Schafer:2010:TID, author = "Max Sch{\"a}fer and Oege de Moor", title = "Type inference for datalog with complex type hierarchies", journal = j-SIGPLAN, volume = "45", number = "1", pages = "145--156", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Henzinger:2010:BQN, author = "Thomas A. Henzinger", title = "From {Boolean} to quantitative notions of correctness", journal = j-SIGPLAN, volume = "45", number = "1", pages = "157--158", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pitts:2010:NS, author = "Andrew M. Pitts", title = "Nominal system {T}", journal = j-SIGPLAN, volume = "45", number = "1", pages = "159--170", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hobor:2010:TIA, author = "Aquinas Hobor and Robert Dockins and Andrew W. 
Appel", title = "A theory of indirection via approximation", journal = j-SIGPLAN, volume = "45", number = "1", pages = "171--184", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dreyer:2010:RML, author = "Derek Dreyer and Georg Neis and Andreas Rossberg and Lars Birkedal", title = "A relational modal logic for higher-order stateful {ADTs}", journal = j-SIGPLAN, volume = "45", number = "1", pages = "185--198", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Suter:2010:DPA, author = "Philippe Suter and Mirco Dotta and Viktor Kuncak", title = "Decision procedures for algebraic data types with abstractions", journal = j-SIGPLAN, volume = "45", number = "1", pages = "199--210", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Magill:2010:ANA, author = "Stephen Magill and Ming-Hsien Tsai and Peter Lee and Yih-Kuen Tsay", title = "Automatic numeric abstractions for heap-manipulating programs", journal = j-SIGPLAN, volume = "45", number = "1", pages = "211--222", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jost:2010:SDQ, author = "Steffen Jost and Kevin Hammond and Hans-Wolfgang Loidl and Martin Hofmann", title = "Static determination of quantitative resource usage for higher-order programs", journal = j-SIGPLAN, volume = "45", number = "1", pages = "223--236", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Malecha:2010:TVR, author = "Gregory Malecha and Greg Morrisett and Avraham Shinnar and Ryan Wisnesky", title = "Toward a verified relational database management system", journal = j-SIGPLAN, volume = "45", number = "1", pages = "237--248", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; 
https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Podelski:2010:CGF, author = "Andreas Podelski and Thomas Wies", title = "Counterexample-guided focus", journal = j-SIGPLAN, volume = "45", number = "1", pages = "249--260", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nanevski:2010:SVH, author = "Aleksandar Nanevski and Viktor Vafeiadis and Josh Berdine", title = "Structuring the verification of heap-manipulating programs", journal = j-SIGPLAN, volume = "45", number = "1", pages = "261--274", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jia:2010:DTP, author = "Limin Jia and Jianzhou Zhao and Vilhelm Sj{\"o}berg and Stephanie Weirich", title = "Dependent types and program equivalence", journal = j-SIGPLAN, volume = "45", number = "1", pages = "275--286", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hutchins:2010:PSS, author = "DeLesley S. Hutchins", title = "Pure subtype systems", journal = j-SIGPLAN, volume = "45", number = "1", pages = "287--298", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gay:2010:MST, author = "Simon J. Gay and Vasco T. Vasconcelos and Ant{\'o}nio Ravara and Nils Gesbert and Alexandre Z. Caldeira", title = "Modular session types for distributed object-oriented programming", journal = j-SIGPLAN, volume = "45", number = "1", pages = "299--312", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Srivastava:2010:PVP, author = "Saurabh Srivastava and Sumit Gulwani and Jeffrey S. 
Foster", title = "From program verification to program synthesis", journal = j-SIGPLAN, volume = "45", number = "1", pages = "313--326", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Vechev:2010:AGS, author = "Martin Vechev and Eran Yahav and Greta Yorsh", title = "Abstraction-guided synthesis of synchronization", journal = j-SIGPLAN, volume = "45", number = "1", pages = "327--338", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bodik:2010:PAN, author = "Rastislav Bodik and Satish Chandra and Joel Galenson and Doug Kimelman and Nicholas Tung and Shaon Barman and Casey Rodarmor", title = "Programming with angelic nondeterminism", journal = j-SIGPLAN, volume = "45", number = "1", pages = "339--352", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Greenberg:2010:CMM, author = "Michael Greenberg and Benjamin C. Pierce and Stephanie Weirich", title = "Contracts made manifest", journal = j-SIGPLAN, volume = "45", number = "1", pages = "353--364", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Siek:2010:TB, author = "Jeremy G. 
Siek and Philip Wadler", title = "Threesomes, with and without blame", journal = j-SIGPLAN, volume = "45", number = "1", pages = "365--376", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wrigstad:2010:ITU, author = "Tobias Wrigstad and Francesco Zappa Nardelli and Sylvain Lebresne and Johan {\"O}stlund and Jan Vitek", title = "Integrating typed and untyped code in a scripting language", journal = j-SIGPLAN, volume = "45", number = "1", pages = "377--388", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tate:2010:GCO, author = "Ross Tate and Michael Stepp and Sorin Lerner", title = "Generating compiler optimizations from proofs", journal = j-SIGPLAN, volume = "45", number = "1", pages = "389--402", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dias:2010:AGI, author = "Jo{\~a}o Dias and Norman Ramsey", title = "Automatically generating instruction selectors using declarative machine descriptions", journal = j-SIGPLAN, volume = "45", number = "1", pages = "403--416", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jim:2010:SAD, author = "Trevor Jim and Yitzhak Mandelbaum and David Walker", title = "Semantics and algorithms for data-dependent grammars", journal = j-SIGPLAN, volume = "45", number = "1", pages = "417--430", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Broberg:2010:PRB, author = "Niklas Broberg and David Sands", title = "{Paralocks}: role-based information flow control and beyond", journal = j-SIGPLAN, volume = "45", number = "1", pages = "431--444", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN 
Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bhargavan:2010:MVS, author = "Karthikeyan Bhargavan and C{\'e}dric Fournet and Andrew D. Gordon", title = "Modular verification of security protocol code by typing", journal = j-SIGPLAN, volume = "45", number = "1", pages = "445--456", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Martin:2010:DCO, author = "Jean-Phillipe Martin and Michael Hicks and Manuel Costa and Periklis Akritidis and Miguel Castro", title = "Dynamically checking ownership policies in concurrent {C}\slash {C++} programs", journal = j-SIGPLAN, volume = "45", number = "1", pages = "457--470", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Heizmann:2010:NI, author = "Matthias Heizmann and Jochen Hoenicke and Andreas Podelski", title = "Nested interpolants", journal = j-SIGPLAN, volume = "45", number = "1", pages = "471--482", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Filinski:2010:MA, author = "Andrzej Filinski", title = "Monads in action", journal = j-SIGPLAN, volume = "45", number = "1", pages = "483--494", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kobayashi:2010:HOM, author = "Naoki Kobayashi and Naoshi Tabuchi and Hiroshi Unno", title = "Higher-order multi-parameter tree transducers and recursion schemes for program verification", journal = j-SIGPLAN, volume = "45", number = "1", pages = "495--508", month = jan, year = "2010", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Mar 15 19:13:16 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nikhil:2010:UGP, author = "Rishiyur S. 
Nikhil", title = "Using {GPCE} principles for hardware systems and accelerators: (bridging the gap to {HW} design)", journal = j-SIGPLAN, volume = "45", number = "2", pages = "1--2", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621608", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Moore's Law has precipitated a crisis in the creation of hardware systems (ASICs and FPGAs)-how to design such enormously complex concurrent systems quickly, reliably and affordably? At the same time, portable devices, the energy crisis, and high performance computing present a related challenge-how to move complex and high-performance algorithms from software into hardware (for more speed and/or energy efficiency)?\par In this talk I will start with a brief technical introduction to BSV, a language that directly addresses these concerns. It uses ideas from Guarded Atomic Actions (cf. Term Rewriting Systems, TLA+, Unity, and EventB) to address complex concurrency with scalability. It borrows from Haskell (types, type classes, higher-order functions) for robustness and powerful program generation (a.k.a. 'static elaboration' to HW designers). And it is fully synthesizable (compilable) into high-quality RTL (Verilog/VHDL). I will then describe some of the remarkable projects that BSV has enabled in industry and academia today.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "Bluespec Systemverilog; BSV; energy efficient computing; FPGA; hardware accelerators; hardware/software codesign; Haskell; high level synthesis; high performance computing; hybrid computing; term rewriting systems", } @Article{Cordy:2010:EOO, author = "James R. Cordy", title = "Eating our own dog food: {DSLs} for generative and transformational engineering", journal = j-SIGPLAN, volume = "45", number = "2", pages = "3--4", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837852.1621609", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Languages and systems to support generative and transformational solutions have been around a long time. Systems such as XVCL, DMS, ASF+SDF, Stratego and TXL have proven mature, efficient and effective in a wide range of applications. Even so, adoption remains a serious issue - almost all successful production applications of these systems in practice either involve help from the original authors or years of experience to get rolling. While work on accessibility is active, with efforts such as ETXL, Stratego XT, Rascal and Colm, the fundamental big step remains - it's not obvious how to apply a general purpose transformational system to any given generation or transformation problem, and the real power is in the paradigms of use, not the languages themselves.\par In this talk I will propose an agenda for addressing this problem by taking our own advice - designing and implementing domain specific languages (DSLs) for specific generative, transformational and analysis problem domains. We widely advise end users of the need for DSLs for their kinds of problems - why not for our kinds? 
And we use our tools for implementing their DSLs---why not our own? I will outline a general method for using transformational techniques to implement transformational and generative DSLs, and review applications of the method to implementing example text-based DSLs for model-based code generation and static code analysis. Finally, I will outline some first steps in implementing model transformation DSLs using the same idea---retaining the maturity and efficiency of our existing tools while bringing them to the masses by 'eating our own dogfood'.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "domain-specific languages; generative programming; model driven engineering; source transformation systems", } @Article{Willcock:2010:RGP, author = "Jeremiah James Willcock and Andrew Lumsdaine and Daniel J. Quinlan", title = "Reusable, generic program analyses and transformations", journal = j-SIGPLAN, volume = "45", number = "2", pages = "5--14", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621611", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The optimizations in modern compilers are constructed for a predetermined set of primitive types. As a result, programmers are unable to exploit optimizations for user-defined types where these optimizations would be correct and beneficial. Moreover, because the set of optimizations is also fixed, programmers are unable to incorporate new optimizations into the compiler. To address these limitations, we apply the reuse methodologies from generic programming to compiler analyses and optimizations. To enable compilers to apply optimizations to classes of types rather than particular types, we define optimizations in terms of generic interface descriptions (similar to C++ concepts or Haskell type classes). By extending these interface descriptions to include associated program analysis and transformation fragments, we enable compilers to incorporate user-defined transformations and analyses. Since these transformations are explicitly associated with interface descriptions, they can be applied in generic fashion by the compiler. We demonstrate that classical compiler optimizations, when generalized using this framework, can apply to a broad range of types, both built-in and user-defined.
Finally, we present an initial implementation, the principles of which are generalizable to other compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compiler optimization; generic programming", } @Article{Bagge:2010:ASB, author = "Anya Helene Bagge and Valentin David and Magne Haveraaen", title = "The axioms strike back: testing with concepts and axioms in {C++}", journal = j-SIGPLAN, volume = "45", number = "2", pages = "15--24", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621612", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern development practices encourage extensive testing of code while it is still under development, using unit tests to check individual code units in isolation. Such tests are typically case-based, checking a likely error scenario or an error that has previously been identified and fixed. Coming up with good test cases is challenging, and focusing on individual tests can distract from creating tests that cover the full functionality.\par Axioms, known from program specification, allow for an alternative way of generating test cases, where the intended functionality is described as rules or equations that can be checked automatically. Axioms are proposed as part of the {\em concept\/} feature of the upcoming C++0x standard.\par In this paper, we describe how tests may be generated automatically from axioms in C++ concepts, and supplied with appropriate test data to form effective automated unit tests.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "algebraic specification; axiom-based testing; axioms; C++; C++0x; concepts; generative programming; mouldable programming; program transformation; test generation; unit testing", } @Article{Garcia:2010:TFT, author = "Ronald Garcia and Andrew Lumsdaine", title = "Toward foundations for type-reflective metaprogramming", journal = j-SIGPLAN, volume = "45", number = "2", pages = "25--34", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621613", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "C++ template metaprogramming has been used with great success to build software applications and libraries. In practice, however, template metaprogramming suffers usability, reliability, and capability shortcomings, and it is not well understood in theory. Template metaprogramming has these problems because it relies on emergent properties of disparate language features that were tailored to other purposes. As a step toward solid and sound language support for metaprogramming, this paper establishes firm semantic foundations for select capabilities of template metaprogramming.\par We analyze C++ and the idioms of template metaprogramming and isolate, in a language-neutral fashion, fundamental capabilities of C++ that enable metaprogramming.
Guided by this analysis, we present a design for a core calculus that directly expresses fundamental metaprogramming capabilities, including static computation, code generation, and type reflection. We prove a type-safety property for compile-time evaluation of metaprograms. To formally connect the core calculus to programming practice, we present a more convenient surface language for metaprogramming. Its semantics are captured by type-directed translation to the core calculus. We prove that this translation preserves well-typing.\par This idealized presentation averts some of the shortcomings of C++ template metaprogramming and provides a framework for further study.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "c++; metaprogramming; reflection; semantics", } @Article{Sadat-Mohtasham:2010:TPD, author = "Hossein Sadat-Mohtasham and H. James Hoover", title = "Transactional pointcuts: designation reification and advice of interrelated join points", journal = j-SIGPLAN, volume = "45", number = "2", pages = "35--44", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837852.1621615", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Aspect-oriented mechanisms are characterized by their join point models. A join point model has three components: join points, which are elements of language semantics; 'a means of identifying join points'; and 'a means of affecting the behaviour at those join points.' A pointcut-advice model is a dynamic join point model in which join points are points in program execution. Pointcuts select a set of join points, and advice affects the behaviour of the selected join points. In this model, join points are typically selected and advised independently of each other. That is, the relationships between join points are not taken into account in join point selection and advice. In practice, join points are often not independent. Instead, they form part of a higher-level operation that implements the intent of the developer ({\em e.g.\/} managing a resource). There are natural situations in which join points should be selected only if they play a specific role in that operation.\par We propose a new join point model that takes join point interrelationships into account and allows the designation of more complex computations as join points. Based on the new model, we have designed an aspect-oriented construct called a {\em transactional pointcut (transcut)}. Transcuts select sets of interrelated join points and reify them into higher-level join points that can be advised. They share much of the machinery and intuition of pointcuts, and can be viewed as their natural extension. We have implemented a transcuts prototype as an extension to the AspectJ language and integrated it into the abc compiler.
We present an example where a transcut is applied to implement recommended resource handling practices in the presence of exceptions within method boundaries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "aspect-oriented programming; join point model; transactional pointcut", } @Article{Akai:2010:EAS, author = "Shumpei Akai and Shigeru Chiba", title = "Extending {AspectJ} for separating regions", journal = j-SIGPLAN, volume = "45", number = "2", pages = "45--54", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837852.1621616", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Synchronization is a good candidate for an aspect in aspect-oriented programming (AOP) since programmers have to choose the best granularity of synchronization for the underlying hardware to obtain the best execution performance. If synchronization is an aspect, programmers can change the synchronization code independently of the rest of the program when the program runs on different hardware. However, existing AOP languages such as AspectJ have problems. They cannot select an arbitrary code region as a join point. Moreover, they cannot enforce weaving of a synchronization aspect. Since it is an alternative feature in feature modeling, at least one of the available synchronization aspects must be woven. Otherwise, the program would be thread-unsafe. Since an aspect in AspectJ is inherently optional, programmers must be responsible for weaving it. To solve these problems, this paper proposes two new constructs for AspectJ, {\em regioncut\/} and {\em assertions for advice}. Regioncut selects an arbitrary code region as a join point, and assertion for advice enforces weaving of a mandatory advice. We implemented these constructs by extending the AspectBench compiler. We evaluated the design of our constructs by applying them to two open-source software products, Javassist and Hadoop.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "aspect-oriented programming; feature-oriented programming; region; synchronization", } @Article{Liu:2010:LFI, author = "Yanhong A. Liu and Michael Gorbovitski and Scott D. Stoller", title = "A language and framework for invariant-driven transformations", journal = j-SIGPLAN, volume = "45", number = "2", pages = "55--64", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837852.1621617", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes a language and framework that allow coordinated transformations driven by invariants to be specified declaratively, as invariant rules, and applied automatically. The framework supports incremental maintenance of invariants for program design and optimization, as well as general transformations for instrumentation, refactoring, and other purposes.
This paper also describes our implementations for transforming Python and C programs and experiments with successful applications of the systems in generating efficient implementations from clear and modular specifications, in instrumenting programs for runtime verification, profiling, and debugging, and in code refactoring.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "incremental maintenance; invariants; program optimization; program transformation; runtime invariant checking", } @Article{Wehr:2010:JBP, author = "Stefan Wehr and Peter Thiemann", title = "{JavaGI} in the battlefield: practical experience with generalized interfaces", journal = j-SIGPLAN, volume = "45", number = "2", pages = "65--74", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621619", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Generalized interfaces are an extension of the interface concept found in object-oriented languages such as Java or C\#. The extension is inspired by Haskell's type classes. It supports retroactive and type-conditional interface implementations, binary methods, symmetric multimethods, interfaces over families of types, and static interface methods.\par This article reports practical experience with generalized interfaces as implemented in the JavaGI language. Several real-world case studies demonstrate how generalized interfaces provide solutions to extension and integration problems with components in binary form, how they make certain design patterns redundant, and how they eliminate various run-time errors. In each case study, the use of JavaGI results in elegant and highly readable code.\par Furthermore, the article discusses the implementation of a compiler and a run-time system for JavaGI. Benchmarks show that our implementation offers acceptable performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "case studies; external methods; JavaGI; multimethods; retroactive interface implementation", } @Article{McGachey:2010:CJC, author = "Phil McGachey and Antony L. Hosking and J. Eliot B. Moss", title = "Classifying {Java} class transformations for pervasive virtualized access", journal = j-SIGPLAN, volume = "45", number = "2", pages = "75--84", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621620", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The indirection of object accesses is a common theme for target domains as diverse as transparent distribution, persistence, and program instrumentation. Virtualizing accesses to fields and methods (by redirecting calls through accessor and indirection methods) allows interposition of arbitrary code, extending the functionality of an application beyond that intended by the original developer.\par We present class modifications performed by our RuggedJ transparent distribution platform for standard Java virtual machines. RuggedJ abstracts over the location of objects by implementing a single object model for local and remote objects. 
However, the implementation of this model is complicated by the presence of native and system code; classes loaded by Java's bootstrap class loader can be rewritten only in a limited manner, and so cannot be modified to conform to RuggedJ's complex object model. We observe that system code comprises the majority of a given Java application: an average of 76\% in the applications we study. We consider the constraints imposed upon pervasive class transformation within Java, and present a framework for systematically rewriting arbitrary applications. Our system accommodates all system classes, allowing both user and system classes alike to be referenced using a single object model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "Java; object model; program transformation", } @Article{Villazon:2010:ARA, author = "Alex Villaz{\'o}n and Walter Binder and Danilo Ansaloni and Philippe Moret", title = "Advanced runtime adaptation for {Java}", journal = j-SIGPLAN, volume = "45", number = "2", pages = "85--94", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621621", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic aspect-oriented programming (AOP) enables runtime adaptation of aspects, which is important for building sophisticated, aspect-based software engineering tools, such as adaptive profilers or debuggers that dynamically modify instrumentation code in response to user interactions. Today, many AOP frameworks for Java, notably AspectJ, focus on aspect weaving at compile-time or at load-time, and offer only limited support for aspect adaptation and reweaving at runtime. In this paper, we introduce HotWave, an AOP framework based on AspectJ for standard Java Virtual Machines (JVMs). HotWave supports dynamic (re)weaving of previously loaded classes, and it ensures that all classes loaded in a JVM can be (re)woven, including the classes of the standard Java class library. HotWave features a novel mechanism for inter-advice communication, enabling efficient data passing between advices that are woven into the same method. We explain HotWave's programming model and discuss our implementation techniques.
As a case study, we present an adaptive, aspect-based profiler that leverages HotWave's distinguishing features.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "AspectJ; bytecode instrumentation; code hotswapping; dynamic aspect-oriented programming; Java Virtual Machine; runtime aspect adaptation and (re)weaving", } @Article{Villazon:2010:HCA, author = "Alex Villaz{\'o}n and Walter Binder and Danilo Ansaloni and Philippe Moret", title = "{HotWave}: creating adaptive tools with dynamic aspect-oriented programming in {Java}", journal = j-SIGPLAN, volume = "45", number = "2", pages = "95--98", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621622", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Developing tools for profiling, debugging, testing, and reverse engineering is error-prone, time-consuming, and therefore costly when using low-level techniques, such as bytecode instrumentation. As a solution to these problems, we promote tool development in Java using high-level aspect-oriented programming (AOP). We demonstrate that the use of aspects yields compact tools that are easy to develop and extend. As an enabling technology, we rely on HotWave, a new tool for dynamic and comprehensive aspect weaving. HotWave reconciles compatibility with existing virtual machine and AOP technologies. It provides support for runtime adaptation of aspects and reweaving of previously loaded code, as well as the ability to weave aspects into all methods executing in a Java Virtual Machine, including methods in the standard Java class library. HotWave also features a new mechanism for efficiently passing data between advices that are woven into the same method. We demonstrate the benefits of HotWave's distinguishing features with two case studies in the area of profiling.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "AspectJ; bytecode instrumentation; code hotswapping; dynamic aspect-oriented programming; Java Virtual Machine; profiling; runtime weaving", } @Article{Heidenreich:2010:GST, author = "Florian Heidenreich and Jendrik Johannes and Mirko Seifert and Christian Wende and Marcel B{\"o}hme", title = "Generating safe template languages", journal = j-SIGPLAN, volume = "45", number = "2", pages = "99--108", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621624", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Template languages are widely used within generative programming, because they provide intuitive means to generate software artefacts expressed in a specific object language. However, most template languages perform template instantiation on the level of string literals, which allows neither syntax checks nor semantic analysis. To make sure that generated artefacts always conform to the object language, we propose to perform static analysis at template design time.
In addition, the increasing popularity of domain-specific languages (DSLs) demands an approach that allows reuse of both the concepts of template languages and the corresponding tools.\par In this paper we address the issues mentioned above by presenting how existing languages can be automatically extended with generic template concepts (e.g., placeholders, loops, conditions) to obtain safe template languages. These languages provide means for syntax checking and static semantic analysis w.r.t. the object language at template design time. We discuss the prerequisites for this extension, analyse the types of correctness properties that can be assured at template design time, and exemplify the key benefits of this approach on a textual DSL and Java.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "generative programming; language extension; safe authoring; template language", } @Article{Kong:2010:APT, author = "Soonho Kong and Wontae Choi and Kwangkeun Yi", title = "Abstract parsing for two-staged languages with concatenation", journal = j-SIGPLAN, volume = "45", number = "2", pages = "109--116", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621625", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This article, based on Doh, Kim, and Schmidt's 'abstract parsing' technique, presents an abstract interpretation for statically checking the syntax of generated code in two-staged programs. Abstract parsing is a static analysis technique for checking the syntax of generated strings. We adopt this technique for two-staged programming languages and formulate it in the abstract interpretation framework. We parameterize our analysis with the abstract domain so that one can choose the abstract domain as long as it satisfies the condition we provide. We also present an instance of the abstract domain, namely an abstract parse stack and its widening with k-cutting.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "abstract interpretation; multi-staged languages; parsing; program analysis", } @Article{Nedunuri:2010:SFP, author = "Srinivas Nedunuri and William R. Cook", title = "Synthesis of fast programs for maximum segment sum problems", journal = j-SIGPLAN, volume = "45", number = "2", pages = "117--126", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837852.1621626", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is well-known that a naive algorithm can often be turned into an efficient program by applying appropriate semantics-preserving transformations. This technique has been used to derive programs to solve a variety of maximum-sum problems. One problem with this approach is that each problem variation requires a new set of transformations to be derived. An alternative approach to generation combines problem specifications with flexible algorithm theories to derive efficient algorithms. We show how this approach can be implemented in Haskell and applied to solve constraint satisfaction problems.
We illustrate this technique by deriving programs for three varieties of maximum-weight-sum problem. The derivations of the different programs are similar, and the resulting programs are asymptotically faster in practice than the programs created by transformation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "algorithms; branch-and-bound; formal methods; program synthesis; segment-sum problems", } @Article{Radermacher:2010:GEI, author = "Ansgar Radermacher and Arnaud Cuccuru and Sebastien Gerard and Fran{\c{c}}ois Terrier", title = "Generating execution infrastructures for component-oriented specifications with a model driven toolchain: a case study for {MARTE}'s {GCM} and real-time annotations", journal = j-SIGPLAN, volume = "45", number = "2", pages = "127--136", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621628", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The development of embedded systems becomes more and more complex. Model driven engineering can help to manage this complexity by specifying real-time properties in a declarative way and automating the deployment. The UML profile MARTE is an OMG standard that allows modeling of real-time properties. However, there is no execution infrastructure that supports MARTE's generic component model (GCM) and the application modeling (HLAM).\par The contribution of the paper is twofold: it presents a proposition of a component model with flexible interaction support that allows tailoring code generation to domain and target requirements. Second, it will show how MARTE's GCM concepts can be implemented by means of the proposed component model. The proposed component model has been largely developed in the context of the French national project Flex-eWare with the intention to unify major component models, notably the CORBA component model (CCM) and Fractal. The paper explains the major elements of this model in detail and shows how specific connectors and containers can implement MARTE specifications. We present the tool support that is integrated into a UML modeler and based on model-to-model and model-to-text transformations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "component models; connectors; MARTE; middleware; model-driven engineering", } @Article{Cassou:2010:GPA, author = "Damien Cassou and Benjamin Bertran and Nicolas Loriant and Charles Consel", title = "A generative programming approach to developing pervasive computing systems", journal = j-SIGPLAN, volume = "45", number = "2", pages = "137--146", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621629", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Developing pervasive computing applications is a difficult task because it requires dealing with a wide range of issues: heterogeneous devices, entity distribution, entity coordination, low-level hardware knowledge.
\ldots{} Besides requiring various areas of expertise, programming such applications involves writing a lot of administrative code to glue technologies together and to interface with both hardware and software components.\par This paper proposes a generative programming approach to providing programming, execution and simulation support dedicated to the pervasive computing domain. This approach relies on a domain-specific language, named DiaSpec, dedicated to the description of pervasive computing systems. Our generative approach factors out features of distributed systems technologies, making DiaSpec-specified software systems portable.\par The DiaSpec compiler is implemented and has been used to generate dedicated programming frameworks for a variety of pervasive computing applications, including detailed ones to manage the building of an engineering school.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "DSL; generative programming; pervasive computing", } @Article{Jarvi:2010:AUI, author = "Jaakko J{\"a}rvi and Mat Marcus and Sean Parent and John Freeman and Jacob Smith", title = "Algorithms for user interfaces", journal = j-SIGPLAN, volume = "45", number = "2", pages = "147--156", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621630", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "User interfaces for modern applications must support a rich set of interactive features. It is commonplace to find applications with dependencies between values manipulated by user interface elements, conditionally enabled controls, and script record-ability and playback against different documents. A significant fraction of the application programming effort is devoted to implementing such functionality, and the resulting code is typically not reusable.\par This paper extends our 'property models' approach to programming user interfaces. Property models allow a large part of the functionality of a user interface to be implemented in reusable libraries, reducing application specific code to a set of declarative rules. We describe how, as a by-product of computations that maintain the values of user interface elements, property models obtain accurate information of the currently active dependencies among those elements. This information enables further expanding the class of user interface functionality that we can encode as generic algorithms. In particular, we describe automating the decisions for the enablement of user interface widgets and activation of command widgets. 
Failing to disable or deactivate widgets correctly is a common source of user-interface defects, which our approach largely removes.\par We report on the increased reuse, reduced defect rates, and improved user interface design turnarounds in a commercial software development effort as a result of adopting our approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "component software; constraint systems; declarative specifications; software reuse; user interfaces", } @Article{Kastner:2010:MRP, author = "Christian K{\"a}stner and Sven Apel and Martin Kuhlemann", title = "A model of refactoring physically and virtually separated features", journal = j-SIGPLAN, volume = "45", number = "2", pages = "157--166", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837852.1621632", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Physical separation with class refinements and method refinements {\`a} la AHEAD and virtual separation using annotations {\`a} la {\em \#ifdef\/} or CIDE are two competing implementation approaches for software product lines with complementary advantages. Although both approaches have been mainly discussed in isolation, we strive for an integration to leverage the respective advantages. In this paper, we lay the foundation for such an integration by providing a model that supports both physical and virtual separation and by describing refactorings in both directions. We prove the refactorings complete, so every virtually separated product line can be automatically transformed into a physically separated one (replacing annotations by refinements) and vice versa. To demonstrate the feasibility of our approach, we have implemented the refactorings in our tool CIDE and conducted four case studies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "AHEAD; CIDE; FeatureHouse; preprocessor; refinements; separation of concerns; software product lines", } @Article{Sanen:2010:MPS, author = "Frans Sanen and Eddy Truyen and Wouter Joosen", title = "Mapping problem-space to solution-space features: a feature interaction approach", journal = j-SIGPLAN, volume = "45", number = "2", pages = "167--176", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837852.1621633", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mapping problem-space features into solution-space features is a fundamental configuration problem in software product line engineering. A configuration problem is defined as generating the most optimal combination of software features given a requirements specification and given a set of configuration rules. Current approaches however provide little support for expressing complex configuration rules between problem and solution space that support incomplete requirements specifications. In this paper, we propose an approach to model complex configuration rules based on a generalization of the concept of problem-solution feature interactions. 
These are interactions between solution-space features that only arise in specific problem contexts. The use of an existing tool to support our approach is also discussed: we use the DLV answer set solver to express a particular configuration problem as a logic program whose answer set corresponds to the optimal combinations of solution-space features. We motivate and illustrate our approach with a case study in the field of managing dynamic adaptations in distributed software, where the goal is to generate an optimal protocol for accommodating a given adaptation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "configuration knowledge; default logic; distributed runtime adaptation; DLV; problem-solution feature interactions; software product line engineering", } @Article{Kuhlemann:2010:SCN, author = "Martin Kuhlemann and Don Batory and Christian K{\"a}stner", title = "Safe composition of non-monotonic features", journal = j-SIGPLAN, volume = "45", number = "2", pages = "177--186", month = feb, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1621607.1621634", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:37:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programs can be composed from features. We want to verify automatically that all legal combinations of features can be composed safely without errors. Prior work on this problem assumed that features add code monotonically. We generalize prior work to enable features to add {\em and remove\/} code, describe our analyses and implementation, and review case studies. We observe that more expressive features increase the complexity of developed programs rapidly -- up to the point where tools and automated concepts as presented in this paper are indispensable for verification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "AHEAD; feature-oriented programming; refactoring; safe composition", } @Article{Brewer:2010:TDR, author = "Eric A. Brewer", title = "Technology for developing regions: {Moore}'s law is not enough", journal = j-SIGPLAN, volume = "45", number = "3", pages = "1--2", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736021", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The historic focus of development has rightfully been on macroeconomics and good governance, but technology has an increasingly large role to play. In this talk, I review several novel technologies that we have deployed in India and Africa, and discuss the challenges and opportunities of this new subfield of EECS research. Working with the Aravind Eye Hospital, we are currently supporting doctor / patient videoconferencing in 30+ rural villages; more than 25,000 people have had their blindness cured due to these exams.\par Although Moore's Law has led to great cost reductions and thus enabled new technologies, we have reached essentially the low point for cost: the computing is essentially free compared to the rest of the system. 
The premium is thus on a combination of (1) deeper integration (fewer components), (2) shared usage models (even phones are shared), and (3) lower operating costs in terms of power and connectivity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "developing regions; ictd; it for development.", } @Article{Ipek:2010:DRM, author = "Engin Ipek and Jeremy Condit and Edmund B. Nightingale and Doug Burger and Thomas Moscibroda", title = "Dynamically replicated memory: building reliable systems from nanoscale resistive memories", journal = j-SIGPLAN, volume = "45", number = "3", pages = "3--14", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736023", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "DRAM is facing severe scalability challenges in sub-45nm technology nodes due to precise charge placement and sensing hurdles in deep-submicron geometries. Resistive memories, such as phase-change memory (PCM), already scale well beyond DRAM and are a promising DRAM replacement. Unfortunately, PCM is write-limited, and current approaches to managing writes must decommission pages of PCM when the first bit fails.\par This paper presents {\em dynamically replicated memory\/} (DRM), the first hardware and operating system interface designed for PCM that allows {\em continued operation through graceful degradation\/} when hard faults occur. DRM reuses memory pages that contain hard faults by dynamically forming pairs of complementary pages that act as a single page of storage. No changes are required to the processor cores, the cache hierarchy, or the operating system's page tables. By changing the memory controller, the TLBs, and the operating system to be DRM-aware, we can improve the lifetime of PCM by up to 40x over conventional error-detection techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "phase-change memory; write endurance", } @Article{Kirman:2010:PEA, author = "Nevin Kirman and Jos{\'e} F. Mart{\'\i}nez", title = "A power-efficient all-optical on-chip interconnect using wavelength-based oblivious routing", journal = j-SIGPLAN, volume = "45", number = "3", pages = "15--28", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736024", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an all-optical approach to constructing data networks on chip that combines the following key features: (1) Wavelength-based routing, where the route followed by a packet depends solely on the wavelength of its carrier signal, and not on information either contained in the packet or traveling along with it. (2) Oblivious routing, by which the wavelength (and thus the route) employed to connect a source-destination pair is invariant for that pair, and does not depend on ongoing transmissions by other nodes, thereby simplifying design and operation.
And (3) passive optical wavelength routers, whose routing pattern is set at design time, which allows for area and power optimizations not generally available to solutions that use dynamic routing. Compared to prior proposals, our evaluation shows that our solution is significantly more power efficient at a similar level of performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "nanophotonics; on-chip network; optical network; wavelength-based oblivious routing", } @Article{Neelakantam:2010:RSE, author = "Naveen Neelakantam and David R. Ditzel and Craig Zilles", title = "A real system evaluation of hardware atomicity for software speculation", journal = j-SIGPLAN, volume = "45", number = "3", pages = "29--38", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736026", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper we evaluate the atomic region compiler abstraction by incorporating it into a commercial system. We find that atomic regions are simple and intuitive to integrate into an x86 binary-translation system. Furthermore, doing so trivially enables additional optimization opportunities beyond that achievable by a high-performance dynamic optimizer, which already implements superblocks.\par We show that atomic regions can suffer from severe performance penalties if misspeculations are left uncontrolled, but that a simple software control mechanism is sufficient to rein in all detrimental side-effects. We evaluate using full reference runs of the SPEC CPU2000 integer benchmarks and find that atomic regions enable up to a 9\% (3\% on average) improvement beyond the performance of a tuned product.\par These performance improvements are achieved without any negative side effects. Performance side effects such as code bloat are absent with atomic regions; in fact, static code size is reduced. The hardware necessary is synergistic with other needs and was already available on the commercial product used in our evaluation. Finally, the software complexity is minimal as a single developer was able to incorporate atomic regions into a sophisticated 300,000 line code base in three months, despite never having seen the translator source code beforehand.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "atomicity; checkpoint; dynamic translation; optimization; speculation", } @Article{Harris:2010:DFM, author = "Tim Harris and Sa{\v{s}}a Tomic and Adri{\'a}n Cristal and Osman Unsal", title = "Dynamic filtering: multi-purpose architecture support for language runtime systems", journal = j-SIGPLAN, volume = "45", number = "3", pages = "39--52", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736027", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces a new abstraction to accelerate the read-barriers and write-barriers used by language runtime systems.
We exploit the fact that, dynamically, many barrier executions perform checks but no real work -- e.g., in generational garbage collection (GC), frequent checks are needed to detect the creation of inter-generational references, even though such references occur rarely in many workloads. We introduce a form of dynamic filtering that identifies redundant checks by (i) recording checks that have recently been executed, and (ii) detecting when a barrier is repeating one of these checks. We show how this technique can be applied to a variety of algorithms for GC, transactional memory, and language-based security. By supporting dynamic filtering in the instruction set, we show that the fast-paths of these barriers can be streamlined, reducing the impact on the quality of surrounding code. We show how we accelerate the barriers used for generational GC and transactional memory in the Bartok research compiler. With a 2048-entry filter, dynamic filtering eliminates almost all the overhead of the GC write-barriers. Dynamic filtering eliminates around half the overhead of STM over a non-synchronized baseline -- even when used with an STM that is already designed for low overhead, and which employs static analyses to avoid redundant operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "garbage collection; runtime systems; transactional memory", } @Article{Bergan:2010:CCR, author = "Tom Bergan and Owen Anderson and Joseph Devietti and Luis Ceze and Dan Grossman", title = "{CoreDet}: a compiler and runtime system for deterministic multithreaded execution", journal = j-SIGPLAN, volume = "45", number = "3", pages = "53--64", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736029", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The behavior of a multithreaded program does not depend only on its inputs. Scheduling, memory reordering, timing, and low-level hardware effects all introduce nondeterminism in the execution of multithreaded programs. This severely complicates many tasks, including debugging, testing, and automatic replication. In this work, we avoid these complications by eliminating their root cause: we develop a compiler and runtime system that runs arbitrary multithreaded C/C++ POSIX Threads programs deterministically.\par A trivial nonperformant approach to providing determinism is simply deterministically serializing execution. Instead, we present a compiler and runtime infrastructure that ensures determinism but resorts to serialization rarely, for handling interthread communication and synchronization. We develop two basic approaches, both of which are largely dynamic with performance improved by some static compiler optimizations. First, an ownership-based approach detects interthread communication via an evolving table that tracks ownership of memory regions by threads. Second, a buffering approach uses versioned memory and employs a deterministic commit protocol to make changes visible to other threads. While buffering has larger single-threaded overhead than ownership, it tends to scale better (serializing less often). A hybrid system sometimes performs and scales better than either approach individually.\par Our implementation is based on the LLVM compiler infrastructure. 
It needs neither programmer annotations nor special hardware. Our empirical evaluation uses the PARSEC and SPLASH2 benchmarks and shows that our approach scales comparably to nondeterministic execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compilers; determinism; multicore; multithreading", } @Article{Raman:2010:SPU, author = "Arun Raman and Hanjun Kim and Thomas R. Mason and Thomas B. Jablin and David I. August", title = "Speculative parallelization using software multi-threaded transactions", journal = j-SIGPLAN, volume = "45", number = "3", pages = "65--76", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736030", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the right techniques, multicore architectures may be able to continue the exponential performance trend that elevated the performance of applications of all types for decades. While many scientific programs can be parallelized without speculative techniques, speculative parallelism appears to be the key to continuing this trend for general-purpose applications. Recently-proposed code parallelization techniques, such as those by Bridges et al. and by Thies et al., demonstrate scalable performance on multiple cores by using speculation to divide code into atomic units (transactions) that span multiple threads in order to expose data parallelism. Unfortunately, most software and hardware Thread-Level Speculation (TLS) memory systems and transactional memories are not sufficient because they only support single-threaded atomic units. Multi-threaded Transactions (MTXs) address this problem, but they require expensive hardware support as currently proposed in the literature. This paper proposes a Software MTX (SMTX) system that captures the {\em applicability\/} and {\em performance\/} of hardware MTX, but on {\em existing multicore machines}. The SMTX system yields a harmonic mean speedup of 13.36x on native hardware with four 6-core processors (24 cores in total) running speculatively parallelized applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "automatic parallelization; loop-level parallelism; multi-threaded transactions; pipelined parallelism; software transactional memory; thread-level speculation", } @Article{Lee:2010:REO, author = "Dongyoon Lee and Benjamin Wester and Kaushik Veeraraghavan and Satish Narayanasamy and Peter M. Chen and Jason Flinn", title = "{Respec}: efficient online multiprocessor replay via speculation and external determinism", journal = j-SIGPLAN, volume = "45", number = "3", pages = "77--90", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736031", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Deterministic replay systems record and reproduce the execution of a hardware or software system. While it is well known how to replay uniprocessor systems, replaying shared memory multiprocessor systems at low overhead on commodity hardware is still an open problem. 
This paper presents Respec, a new way to support deterministic replay of shared memory multithreaded programs on commodity multiprocessor hardware. Respec targets online replay in which the recorded and replayed processes execute concurrently.\par Respec uses two strategies to reduce overhead while still ensuring correctness: speculative logging and externally deterministic replay. Speculative logging optimistically logs less information about shared memory dependencies than is needed to guarantee deterministic replay, then recovers and retries if the replayed process diverges from the recorded process. Externally deterministic replay relaxes the degree to which the two executions must match by requiring only their system output and final program states match. We show that the combination of these two techniques results in low recording and replay overhead for the common case of data-race-free execution intervals and still ensures correct replay for execution intervals that have data races.\par We modified the Linux kernel to implement our techniques. Our software system adds on average about 18\% overhead to the execution time for recording and replaying programs with two threads and 55\% overhead for programs with four threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "external determinism; replay; speculative execution", } @Article{Eyerman:2010:PJS, author = "Stijn Eyerman and Lieven Eeckhout", title = "Probabilistic job symbiosis modeling for {SMT} processor scheduling", journal = j-SIGPLAN, volume = "45", number = "3", pages = "91--102", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736033", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Symbiotic job scheduling boosts simultaneous multithreading (SMT) processor performance by co-scheduling jobs that have `compatible' demands on the processor's shared resources. Existing approaches however require a sampling phase, evaluate a limited number of possible co-schedules, use heuristics to gauge symbiosis, are rigid in their optimization target, and do not preserve system-level priorities/shares.\par This paper proposes probabilistic job symbiosis modeling, which predicts whether jobs will create positive or negative symbiosis when co-scheduled without requiring the co-schedule to be evaluated. The model, which uses per-thread cycle stacks computed through a previously proposed cycle accounting architecture, is simple enough to be used in system software. Probabilistic job symbiosis modeling provides six key innovations over prior work in symbiotic job scheduling: (i) it does not require a sampling phase, (ii) it readjusts the job co-schedule continuously, (iii) it evaluates a large number of possible co-schedules at very low overhead, (iv) it is not driven by heuristics, (v) it can optimize a performance target of interest (e.g., system throughput or job turnaround time), and (vi) it preserves system-level priorities/shares. 
These innovations make symbiotic job scheduling both practical and effective.\par Our experimental evaluation, which assumes a realistic scenario in which jobs come and go, reports an average 16\% (and up to 35\%) reduction in job turnaround time compared to the previously proposed SOS (sample, optimize, symbios) approach for a two-thread SMT processor, and an average 19\% (and up to 45\%) reduction in job turnaround time for a four-thread SMT processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "performance modeling; simultaneous multi-threading (SMT); symbiotic job scheduling", } @Article{Shen:2010:RBV, author = "Kai Shen", title = "Request behavior variations", journal = j-SIGPLAN, volume = "45", number = "3", pages = "103--116", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736034", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A large number of user requests execute (often concurrently) within a server system. A single request may exhibit fluctuating hardware characteristics (such as instruction completion rate and on-chip resource usage) over the course of its execution, due to inherent variations in application execution semantics as well as dynamic resource competition on resource-sharing processors like multicores. Understanding such behavior variations can assist fine-grained request modeling and adaptive resource management.\par This paper presents operating system management to track request behavior variations online. In addition to metric sample collection during periodic interrupts, we exploit the frequent system calls in server applications to perform low-cost in-kernel sampling. We utilize identified behavior variations to support or enhance request modeling in request classification, anomaly analysis, and online request signature construction. A foundation of our request modeling is the ability to quantify the difference between two requests' time series behaviors. We evaluate several differencing measures and enhance the classic dynamic time warping technique with additional penalties for asynchronous warp steps. Finally, motivated by fluctuating request resource usage and the resulting contention, we implement contention-easing CPU scheduling on multicore platforms and demonstrate its effectiveness in improving the worst-case request performance.\par Experiments in this paper are based on five server applications -- Apache web server, TPCC, TPCH, RUBiS online auction benchmark, and a user-content-driven online teaching application called WeBWorK.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "hardware counter; multicore; operating system adaptation; request modeling; server system", } @Article{Johnson:2010:DCM, author = "F. Ryan Johnson and Radu Stoica and Anastasia Ailamaki and Todd C. 
Mowry", title = "Decoupling contention management from scheduling", journal = j-SIGPLAN, volume = "45", number = "3", pages = "117--128", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736035", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many parallel applications exhibit unpredictable communication between threads, leading to contention for shared objects. The choice of contention management strategy impacts strongly the performance and scalability of these applications: spinning provides maximum performance but wastes significant processor resources, while blocking-based approaches conserve processor resources but introduce high overheads on the critical path of computation. Under situations of high or changing load, the operating system complicates matters further with arbitrary scheduling decisions which often preempt lock holders, leading to long serialization delays until the preempted thread resumes execution.\par We observe that contention management is orthogonal to the problems of scheduling and load management and propose to decouple them so each may be solved independently and effectively. To this end, we propose a load control mechanism which manages the number of active threads in the system separately from any contention which may exist. By isolating contention management from damaging interactions with the OS scheduler, we combine the efficiency of spinning with the robustness of blocking. The proposed load control mechanism results in stable, high performance for both lightly and heavily loaded systems, requires no special privileges or modifications at the OS level, and can be implemented as a library which benefits existing code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "blocking; concurrency control; contention; load management; multicore; scheduling; spinning; threads", } @Article{Zhuravlev:2010:ASR, author = "Sergey Zhuravlev and Sergey Blagodurov and Alexandra Fedorova", title = "Addressing shared resource contention in multicore processors via scheduling", journal = j-SIGPLAN, volume = "45", number = "3", pages = "129--142", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736036", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Contention for shared resources on multicore processors remains an unsolved problem in existing systems despite significant research efforts dedicated to this problem in the past. Previous solutions focused primarily on hardware techniques and software page coloring to mitigate this problem. Our goal is to investigate how and to what extent contention for shared resource can be mitigated via thread scheduling. Scheduling is an attractive tool, because it does not require extra hardware and is relatively easy to integrate into the system. Our study is the first to provide a comprehensive analysis of contention-mitigating techniques that use only scheduling. 
The most difficult part of the problem is to find a classification scheme for threads, which would determine how they affect each other when competing for shared resources. We provide a comprehensive analysis of such classification schemes using a newly proposed methodology that enables us to evaluate these schemes separately from the scheduling algorithm itself and to compare them to the optimal. As a result of this analysis we discovered a classification scheme that addresses not only contention for cache space, but contention for other shared resources, such as the memory controller, memory bus and prefetching hardware. To show the applicability of our analysis we design a new scheduling algorithm, which we prototype at user level, and demonstrate that it performs within 2\% of the optimal. We also conclude that the highest impact of contention-aware scheduling techniques is not in improving performance of a workload as a whole but in improving quality of service or performance isolation for individual applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "multicore processors; scheduling; shared resource contention", } @Article{Yuan:2010:SED, author = "Ding Yuan and Haohui Mai and Weiwei Xiong and Lin Tan and Yuanyuan Zhou and Shankar Pasupathy", title = "{SherLog}: error diagnosis by connecting clues from run-time logs", journal = j-SIGPLAN, volume = "45", number = "3", pages = "143--154", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736038", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computer systems often fail due to many factors such as software bugs or administrator errors. Diagnosing such production run failures is an important but challenging task since it is difficult to reproduce them in house due to various reasons: (1) unavailability of users' inputs and file content due to privacy concerns; (2) difficulty in building the exact same execution environment; and (3) non-determinism of concurrent executions on multi-processors.\par Therefore, programmers often have to diagnose a production run failure based on logs collected back from customers and the corresponding source code. Such diagnosis requires expert knowledge and is also too time-consuming and tedious to narrow down root causes. To address this problem, we propose a tool, called SherLog, that analyzes source code by leveraging information provided by run-time logs to infer what must or may have happened during the failed production run. It requires neither re-execution of the program nor knowledge on the log's semantics. It infers both control and data value information regarding the failed execution.\par We evaluate SherLog with 8 representative {\em real world\/} software failures (6 software bugs and 2 configuration errors) from 7 applications including 3 servers. Information inferred by SherLog is very useful for programmers to diagnose these evaluated failures.
Our results also show that SherLog can analyze large server applications such as Apache with thousands of logging messages within only 40 minutes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "failure diagnostics; log; static analysis", } @Article{Weeratunge:2010:AMD, author = "Dasarath Weeratunge and Xiangyu Zhang and Suresh Jagannathan", title = "Analyzing multicore dumps to facilitate concurrency bug reproduction", journal = j-SIGPLAN, volume = "45", number = "3", pages = "155--166", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736039", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Debugging concurrent programs is difficult. This is primarily because the inherent non-determinism that arises because of scheduler interleavings makes it hard to easily reproduce bugs that may manifest only under certain interleavings. The problem is exacerbated in multi-core environments where there are multiple schedulers, one for each core. In this paper, we propose a reproduction technique for concurrent programs that execute on multi-core platforms. Our technique performs a lightweight analysis of a failing execution that occurs in a multi-core environment, and uses the result of the analysis to enable reproduction of the bug in a single-core system, under the control of a deterministic scheduler.\par More specifically, our approach automatically identifies the execution point in the re-execution that corresponds to the failure point. It does so by analyzing the failure core dump and leveraging a technique called {\em execution indexing\/} that identifies a related point in the re-execution. By generating a core dump at this point, and comparing the differences between the two dumps, we are able to guide a search algorithm to efficiently generate a failure inducing schedule. Our experiments show that our technique is highly effective and has reasonable overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "concurrency bugs; execution indexing; multi-core; reproduction", } @Article{Burckhardt:2010:RSP, author = "Sebastian Burckhardt and Pravesh Kothari and Madanlal Musuvathi and Santosh Nagarakatte", title = "A randomized scheduler with probabilistic guarantees of finding bugs", journal = j-SIGPLAN, volume = "45", number = "3", pages = "167--178", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736040", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a randomized scheduler for finding concurrency bugs. Like current stress-testing methods, it repeatedly runs a given test program with supplied inputs. However, it improves on stress-testing by finding buggy schedules more effectively and by quantifying the probability of missing concurrency bugs. Key to its design is the characterization of the depth of a concurrency bug as the minimum number of scheduling constraints required to find it. 
In a single run of a program with {\em n\/} threads and {\em k\/} steps, our scheduler detects a concurrency bug of depth {\em d\/} with probability at least 1/{\em nk\/}$^{d-1}$. We hypothesize that in practice, many concurrency bugs (including well-known types such as ordering errors, atomicity violations, and deadlocks) have small bug-depths, and we confirm the efficiency of our schedule randomization by detecting previously unknown and known concurrency bugs in several production-scale concurrent programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "concurrency; race conditions; randomized algorithms; testing", } @Article{Zhang:2010:CDS, author = "Wei Zhang and Chong Sun and Shan Lu", title = "{ConMem}: detecting severe concurrency bugs through an effect-oriented approach", journal = j-SIGPLAN, volume = "45", number = "3", pages = "179--192", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736041", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multicore technology is making concurrent programs increasingly pervasive. Unfortunately, it is difficult to deliver reliable concurrent programs, because of the huge and non-deterministic interleaving space. In reality, without the resources to thoroughly check the interleaving space, critical concurrency bugs can slip into production runs and cause failures in the field. Approaches to making the best use of the limited resources and exposing severe concurrency bugs before software release would be desirable.\par Unlike previous work that focuses on bugs caused by specific interleavings (e.g., races and atomicity-violations), this paper targets concurrency bugs that result in one type of severe effects: program crashes. Our study of the error-propagation process of real-world concurrency bugs reveals a common pattern (50\% in our non-deadlock concurrency bug set) that is highly correlated with program crashes. We call this pattern concurrency-memory bugs: buggy interleavings directly cause memory bugs (NULL-pointer-dereference, dangling-pointer, buffer-overflow, uninitialized-read) on shared memory objects.\par Guided by this study, we built ConMem to monitor program execution, analyze memory accesses and synchronizations, and predictively detect these common and severe concurrency-memory bugs. We also built a validator ConMem-v to automatically prune false positives by enforcing potential bug-triggering interleavings.\par We evaluated ConMem using 7 open-source programs with 9 real-world severe concurrency bugs. ConMem detects more tested bugs (8 out of 9 bugs) than a lock-set-based race detector and an unserializable-interleaving detector that detect 4 and 5 bugs respectively, with a false positive rate about one tenth of the compared tools. ConMem-v further prunes out all the false positives. ConMem has reasonable overhead suitable for development usage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "concurrency bugs; software testing", } @Article{Mesa-Martinez:2010:CPT, author = "Francisco Javier Mesa-Martinez and Ehsan K.
Ardestani and Jose Renau", title = "Characterizing processor thermal behavior", journal = j-SIGPLAN, volume = "45", number = "3", pages = "193--204", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736043", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Temperature is a dominant factor in the performance, reliability, and leakage power consumption of modern processors. As a result, increasing numbers of researchers evaluate thermal characteristics in their proposals. In this paper, we measure a real processor focusing on its thermal characterization executing diverse workloads.\par Our results show that in real designs, thermal transients operate at larger scales than their performance and power counterparts. Conventional thermal simulation methodologies based on profile-based simulation or statistical sampling, such as Simpoint, tend to explore very limited execution spans. Short simulation times can lead to reduced matchings between performance and thermal phases. To illustrate these issues we characterize and classify from a thermal standpoint SPEC00 and SPEC06 applications, which are traditionally used in the evaluation of architectural proposals. This paper concludes with a list of recommendations regarding thermal modeling considerations based on our experimental insights.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "microarchitecture; temperature; thermal simulation", } @Article{Venkatesh:2010:CCR, author = "Ganesh Venkatesh and Jack Sampson and Nathan Goulding and Saturnino Garcia and Vladyslav Bryksin and Jose Lugo-Martinez and Steven Swanson and Michael Bedford Taylor", title = "Conservation cores: reducing the energy of mature computations", journal = j-SIGPLAN, volume = "45", number = "3", pages = "205--218", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736044", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Growing transistor counts, limited power budgets, and the breakdown of voltage scaling are currently conspiring to create a {\em utilization wall\/} that limits the fraction of a chip that can run at full speed at one time. In this regime, specialized, energy-efficient processors can increase parallelism by reducing the per-computation power requirements and allowing more computations to execute under the same power budget. To pursue this goal, this paper introduces {\em conservation cores}. Conservation cores, or {\em c-cores}, are specialized processors that focus on reducing energy and energy-delay instead of increasing performance. This focus on energy makes c-cores an excellent match for many applications that would be poor candidates for hardware acceleration (e.g., irregular integer codes). We present a toolchain for automatically synthesizing c-cores from application source code and demonstrate that they can significantly reduce energy and energy-delay for a wide range of applications. The c-cores support patching, a form of targeted reconfigurability, that allows them to adapt to new versions of the software they target. 
Our results show that conservation cores can reduce energy consumption by up to 16.0x for functions and by up to 2.1x for whole applications, while patching can extend the useful lifetime of individual c-cores to match that of conventional processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "conservation core; heterogeneous many-core; patching; utilization wall", } @Article{Sudan:2010:MPI, author = "Kshitij Sudan and Niladrish Chatterjee and David Nellans and Manu Awasthi and Rajeev Balasubramonian and Al Davis", title = "Micro-pages: increasing {DRAM} efficiency with locality-aware data placement", journal = j-SIGPLAN, volume = "45", number = "3", pages = "219--230", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736045", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Power consumption and DRAM latencies are serious concerns in modern chip-multiprocessor (CMP or multi-core) based compute systems. The management of the DRAM row buffer can significantly impact both power consumption and latency. Modern DRAM systems read data from cell arrays and populate a row buffer as large as 8 KB on a memory request. But only a small fraction of these bits are ever returned back to the CPU. This ends up wasting energy and time to read (and subsequently write back) bits which are used rarely. Traditionally, an open-page policy has been used for uni-processor systems and it has worked well because of spatial and temporal locality in the access stream. In future multi-core processors, the possibly independent access streams of each core are interleaved, thus destroying the available locality and significantly under-utilizing the contents of the row buffer. In this work, we attempt to improve row-buffer utilization for future multi-core systems.\par The schemes presented here are motivated by our observations that a large number of accesses within heavily accessed OS pages are to small, contiguous 'chunks' of cache blocks. Thus, the co-location of chunks (from different OS pages) in a row-buffer will improve the overall utilization of the row buffer contents, and consequently reduce memory energy consumption and access time. Such co-location can be achieved in many ways, notably involving a reduction in OS page size and software or hardware assisted migration of data within DRAM. We explore these mechanisms and discuss the trade-offs involved along with energy and performance improvements from each scheme. On average, for applications with room for improvement, our best performing scheme increases performance by 9\% (max. 18\%) and reduces memory energy consumption by 15\% (max. 70\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "data placement; dram row-buffer management", } @Article{Pelley:2010:PRD, author = "Steven Pelley and David Meisner and Pooya Zandevakili and Thomas F. 
Wenisch and Jack Underwood", title = "Power routing: dynamic power provisioning in the data center", journal = j-SIGPLAN, volume = "45", number = "3", pages = "231--242", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735971.1736047", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data center power infrastructure incurs massive capital costs, which typically exceed energy costs over the life of the facility. To squeeze maximum value from the infrastructure, researchers have proposed over-subscribing power circuits, relying on the observation that peak loads are rare. To ensure availability, these proposals employ power capping, which throttles server performance during utilization spikes to enforce safe power budgets. However, because budgets must be enforced locally -- at each power distribution unit (PDU) -- local utilization spikes may force throttling even when power delivery capacity is available elsewhere. Moreover, the need to maintain reserve capacity for fault tolerance on power delivery paths magnifies the impact of utilization spikes.\par In this paper, we develop mechanisms to better utilize installed power infrastructure, reducing reserve capacity margins and avoiding performance throttling. Unlike conventional high-availability data centers, where collocated servers share identical primary and secondary power feeds, we reorganize power feeds to create shuffled power distribution topologies. Shuffled topologies spread secondary power feeds over numerous PDUs, reducing reserve capacity requirements to tolerate a single PDU failure. Second, we propose Power Routing, which schedules IT load dynamically across redundant power feeds to: (1) shift slack to servers with growing power demands, and (2) balance power draw across AC phases to reduce heating and improve electrical stability. We describe efficient heuristics for scheduling servers to PDUs (an NP-complete problem). Using data collected from nearly 1000 servers in three production facilities, we demonstrate that these mechanisms can reduce the required power infrastructure capacity relative to conventional high-availability data centers by 32\% without performance degradation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "data centers; power infrastructure", } @Article{Ahmad:2010:JOI, author = "Faraz Ahmad and T. N. Vijaykumar", title = "Joint optimization of idle and cooling power in data centers while maintaining response time", journal = j-SIGPLAN, volume = "45", number = "3", pages = "243--256", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735971.1736048", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Server power and cooling power amount to a significant fraction of modern data centers' recurring costs. While data centers provision enough servers to guarantee response times under the maximum loading, data centers operate under much less loading most of the times (e.g., 30-70\% of the maximum loading). 
Previous server-power proposals exploit this under-utilization to reduce the server idle power by keeping active only as many servers as necessary and putting the rest into low-power standby modes. However, these proposals incur higher cooling power due to hot spots created by concentrating the data center loading on fewer active servers, or degrade response times due to standby-to-active transition delays, or both. Other proposals optimize the cooling power but incur considerable idle power. To address the first issue of power, we propose {\em PowerTrade}, which trades-off idle power and cooling power for each other, thereby reducing the total power. To address the second issue of response time, we propose {\em SurgeGuard\/} to overprovision the number of active servers beyond that needed by the current loading so as to absorb future increases in the loading. SurgeGuard is a two-tier scheme which uses well-known over-provisioning at coarse time granularities (e.g., one hour) to absorb the common, smooth increases in the loading, and a novel fine-grain replenishment of the over-provisioned reserves at fine time granularities (e.g., five minutes) to handle the uncommon, abrupt loading surges. Using real-world traces, we show that combining PowerTrade and SurgeGuard reduces total power by 30\% compared to previous low-power schemes while maintaining response times within 1.7\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "cooling power; data center; idle power; power management; response time", } @Article{Goodstein:2010:BAA, author = "Michelle L. Goodstein and Evangelos Vlachos and Shimin Chen and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry", title = "Butterfly analysis: adapting dataflow analysis to dynamic parallel monitoring", journal = j-SIGPLAN, volume = "45", number = "3", pages = "257--270", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735971.1736050", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Online program monitoring is an effective technique for detecting bugs and security attacks in running applications. Extending these tools to monitor parallel programs is challenging because the tools must account for inter-thread dependences and relaxed memory consistency models. Existing tools assume sequential consistency and often slow down the monitored program by orders of magnitude. In this paper, we present a novel approach that avoids these pitfalls by not relying on strong consistency models or detailed inter-thread dependence tracking. Instead, we only assume that events in the distant past on all threads have become visible; we make no assumptions on (and avoid the overheads of tracking) the relative ordering of more recent events on other threads. To overcome the potential state explosion of considering all the possible orderings among recent events, we adapt two techniques from static dataflow analysis, reaching definitions and reaching expressions, to this new domain of dynamic parallel monitoring. Significant modifications to these techniques are proposed to ensure the correctness and efficiency of our approach. We show how our adapted analysis can be used in two popular memory and security tools. 
We prove that our approach does not miss errors, and sacrifices precision only due to the lack of a relative ordering among recent events. Moreover, our simulation study on a collection of Splash-2 and Parsec 2.0 benchmarks running a memory-checking tool on a hardware-assisted logging platform demonstrates the potential benefits in trading off a very low false positive rate for (i) reduced overhead and (ii) the ability to run on relaxed consistency models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "data flow analysis; dynamic program monitoring; parallel programming; static analysis", } @Article{Vlachos:2010:PEA, author = "Evangelos Vlachos and Michelle L. Goodstein and Michael A. Kozuch and Shimin Chen and Babak Falsafi and Phillip B. Gibbons and Todd C. Mowry", title = "{ParaLog}: enabling and accelerating online parallel monitoring of multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "3", pages = "271--284", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736051", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "{\em Instruction-grain lifeguards\/} monitor the events of a running application at the level of individual instructions in order to identify and help mitigate application bugs and security exploits. Because such lifeguards impose a 10-100X slowdown on existing platforms, previous studies have proposed hardware designs to accelerate lifeguard processing. However, these accelerators are either tailored to a specific class of lifeguards or suitable only for monitoring single-threaded programs.\par We present ParaLog, the first design of a system enabling fast online parallel monitoring of multithreaded parallel applications. ParaLog supports a broad class of software-defined lifeguards. We show how three existing accelerators can be enhanced to support online multithreaded monitoring, dramatically reducing lifeguard overheads. We identify and solve several challenges in monitoring parallel applications and/or parallelizing these accelerators, including (i) enforcing inter-thread data dependences, (ii) dealing with inter-thread effects that are not reflected in coherence traffic, (iii) dealing with unmonitored operating system activity, and (iv) ensuring lifeguards can access shared metadata with negligible synchronization overheads. We present our system design for both Sequentially Consistent and Total Store Ordering processors. We implement and evaluate our design on a 16 core simulated CMP, using benchmarks from SPLASH-2 and PARSEC and two lifeguards: a data-flow tracking lifeguard and a memory-access checker lifeguard. Our results show that (i) our parallel accelerators improve performance by 2-9X and 1.13-3.4X for our two lifeguards, respectively, (ii) we are 5-126X faster than the time-slicing approach required by existing techniques, and (iii) our average overheads for applications with eight threads are 51\% and 28\% for the two lifeguards, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "hardware support for debugging; instruction-grain lifeguards; online parallel monitoring", } @Article{Hormati:2010:MMS, author = "Amir H. 
Hormati and Yoonseo Choi and Mark Woh and Manjunath Kudlur and Rodric Rabbah and Trevor Mudge and Scott Mahlke", title = "{MacroSS}: macro-{SIMD}ization of streaming applications", journal = j-SIGPLAN, volume = "45", number = "3", pages = "285--296", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736053", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "SIMD (Single Instruction, Multiple Data) engines are an essential part of the processors in various computing markets, from servers to the embedded domain. Although SIMD-enabled architectures have the capability of boosting the performance of many application domains by exploiting data-level parallelism, it is very challenging for compilers and also programmers to identify and transform parts of a program that will benefit from a particular SIMD engine. The focus of this paper is on the problem of SIMDization for the growing application domain of streaming. Streaming applications are an ideal solution for targeting multi-core architectures, such as shared/distributed memory systems, tiled architectures, and single-core systems. Since these architectures, in most cases, provide SIMD acceleration units as well, it is highly beneficial to generate SIMD code from streaming programs. Specifically, we introduce MacroSS, which is capable of performing macro-SIMDization on high-level streaming graphs. Macro-SIMDization uses high-level information such as execution rates of actors and communication patterns between them to transform the graph structure, vectorize actors of a streaming program, and generate intermediate code. We also propose low-overhead architectural modifications that accelerate shuffling of data elements between the scalar and vectorized parts of a streaming program. Our experiments show that MacroSS is capable of generating code that, on average, outperforms scalar code compiled with the current state-of-art auto-vectorizing compilers by 54\%. Using the low-overhead data shuffling hardware, performance is improved by an additional 8\% with less than 1\% area overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compiler; optimization; SIMD architecture; streaming", } @Article{Woo:2010:CPD, author = "Dong Hyuk Woo and Hsien-Hsin S. Lee", title = "{COMPASS}: a programmable data prefetcher using idle {GPU} shaders", journal = j-SIGPLAN, volume = "45", number = "3", pages = "297--310", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735971.1736054", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A traditional fixed-function graphics accelerator has evolved into a programmable general-purpose graphics processing unit over the last few years. These powerful computing cores are mainly used for accelerating graphics applications or enabling low-cost scientific computing. To further reduce the cost and form factor, an emerging trend is to integrate GPU along with the memory controllers onto the same die with the processor cores. 
However, given such a system-on-chip, the GPU, while occupying a substantial part of the silicon, will sit idle and contribute nothing to the overall system performance when running non-graphics workloads or applications lacking data-level parallelism. In this paper, we propose COMPASS, a compute shader-assisted data prefetching scheme, to leverage the GPU resource for improving single-threaded performance on an integrated system. By harnessing the GPU shader cores with very lightweight architectural support, COMPASS can emulate the functionality of a hardware-based prefetcher using the idle GPU and successfully improve the memory performance of single-thread applications. Moreover, thanks to its flexibility and programmability, one can implement the best performing prefetch scheme to improve each specific application as demonstrated in this paper. With COMPASS, we envision that a future application vendor can provide a custom-designed COMPASS shader bundled with its software to be loaded at runtime to optimize the performance. Our simulation results show that COMPASS can improve the single-thread performance of memory-intensive applications by 68\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compute shader; GPU; prefetch", } @Article{Sanchez:2010:FAS, author = "Daniel Sanchez and Richard M. Yoo and Christos Kozyrakis", title = "Flexible architectural support for fine-grain scheduling", journal = j-SIGPLAN, volume = "45", number = "3", pages = "311--322", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736055", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To make efficient use of CMPs with tens to hundreds of cores, it is often necessary to exploit fine-grain parallelism. However, managing tasks of a few thousand instructions is particularly challenging, as the runtime must ensure load balance without compromising locality and introducing small overheads. Software-only schedulers can implement various scheduling algorithms that match the characteristics of different applications and programming models, but suffer significant overheads as they synchronize and communicate task information over the deep cache hierarchy of a large-scale CMP. To reduce these costs, hardware-only schedulers like Carbon, which implement task queuing and scheduling in hardware, have been proposed. However, a hardware-only solution fixes the scheduling algorithm and leaves no room for other uses of the custom hardware.\par This paper presents a combined hardware-software approach to build fine-grain schedulers that retain the flexibility of software schedulers while being as fast and scalable as hardware ones. We propose asynchronous direct messages (ADM), a simple architectural extension that provides direct exchange of asynchronous, short messages between threads in the CMP without going through the memory hierarchy. ADM is sufficient to implement a family of novel, software-mostly schedulers that rely on low-overhead messaging to efficiently coordinate scheduling and transfer task information. These schedulers match and often exceed the performance and scalability of Carbon when using the same scheduling algorithm.
When the ADM runtime tailors its scheduling algorithm to application characteristics, it outperforms Carbon by up to 70\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "chip-multiprocessors; fine-grain scheduling; many-core; messaging; scheduling; work-stealing", } @Article{Romanescu:2010:SDV, author = "Bogdan F. Romanescu and Alvin R. Lebeck and Daniel J. Sorin", title = "Specifying and dynamically verifying address translation-aware memory consistency", journal = j-SIGPLAN, volume = "45", number = "3", pages = "323--334", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736057", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computer systems with virtual memory are susceptible to design bugs and runtime faults in their address translation (AT) systems. Detecting bugs and faults requires a clear specification of correct behavior. To address this need, we develop a framework for AT-aware memory consistency models. We expand and divide memory consistency into the physical address memory consistency (PAMC) model that defines the behavior of operations on physical addresses and the virtual address memory consistency (VAMC) model that defines the behavior of operations on virtual addresses. As part of this expansion, we show what AT features are required to bridge the gap between PAMC and VAMC. Based on our AT-aware memory consistency specifications, we design efficient dynamic verification hardware that can detect violations of VAMC and thus detect the effects of design bugs and runtime faults, including most AT related bugs in published errata.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "address translation; dynamic verification; memory consistency; virtual memory", } @Article{Ebrahimi:2010:FST, author = "Eiman Ebrahimi and Chang Joo Lee and Onur Mutlu and Yale N. Patt", title = "Fairness via source throttling: a configurable and high-performance fairness substrate for multi-core memory systems", journal = j-SIGPLAN, volume = "45", number = "3", pages = "335--346", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736058", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cores in a chip-multiprocessor (CMP) system share multiple hardware resources in the memory subsystem. If resource sharing is unfair, some applications can be delayed significantly while others are unfairly prioritized. Previous research proposed separate fairness mechanisms in each individual resource. Such resource-based fairness mechanisms implemented independently in each resource can make contradictory decisions, leading to low fairness and loss of performance. Therefore, a coordinated mechanism that provides fairness in the entire shared memory system is desirable.\par This paper proposes a new approach that provides fairness in the {\em entire shared memory system}, thereby eliminating the need for and complexity of developing fairness mechanisms for each individual resource. 
Our technique, Fairness via Source Throttling (FST), estimates the unfairness in the entire shared memory system. If the estimated unfairness is above a threshold set by system software, FST throttles down cores causing unfairness by limiting the number of requests they can inject into the system and the frequency at which they do. As such, our {\em source-based\/} fairness control ensures fairness decisions are made in tandem in the entire memory system. FST also enforces thread priorities/weights, and enables system software to enforce different fairness objectives and fairness-performance tradeoffs in the memory system.\par Our evaluations show that FST provides the best system fairness and performance compared to four systems with no fairness control and with state-of-the-art fairness mechanisms implemented in both shared caches and memory controllers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "fairness; multi-core systems; shared memory systems; system performance", } @Article{Gelado:2010:ADS, author = "Isaac Gelado and Javier Cabezas and Nacho Navarro and John E. Stone and Sanjay Patel and Wen-mei W. Hwu", title = "An asymmetric distributed shared memory model for heterogeneous parallel systems", journal = j-SIGPLAN, volume = "45", number = "3", pages = "347--358", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736059", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Heterogeneous computing combines general purpose CPUs with accelerators to efficiently execute both sequential control-intensive and data-parallel phases of applications. Existing programming models for heterogeneous computing rely on programmers to explicitly manage data transfers between the CPU system memory and accelerator memory.\par This paper presents a new programming model for heterogeneous computing, called Asymmetric Distributed Shared Memory (ADSM), that maintains a shared logical memory space for CPUs to access objects in the accelerator physical memory but not vice versa. The asymmetry allows light-weight implementations that avoid common pitfalls of symmetrical distributed shared memory systems. ADSM allows programmers to assign data objects to performance critical methods. When a method is selected for accelerator execution, its associated data objects are allocated within the shared logical memory space, which is hosted in the accelerator physical memory and transparently accessible by the methods executed on CPUs.\par We argue that ADSM reduces programming efforts for heterogeneous computing systems and enhances application portability. We present a software implementation of ADSM, called GMAC, on top of CUDA in a GNU/Linux environment. We show that applications written in ADSM and running on top of GMAC achieve performance comparable to their counterparts using programmer-managed data transfers. This paper presents the GMAC system and evaluates different design choices. 
We further suggest additional architectural support that will likely allow GMAC to achieve higher application performance than the current CUDA model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "asymmetric distributed shared memory; data-centric programming models; heterogeneous systems", } @Article{Bhattacharjee:2010:ICC, author = "Abhishek Bhattacharjee and Margaret Martonosi", title = "Inter-core cooperative {TLB} for chip multiprocessors", journal = j-SIGPLAN, volume = "45", number = "3", pages = "359--370", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735970.1736060", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Translation Lookaside Buffers (TLBs) are commonly employed in modern processor designs and have considerable impact on overall system performance. A number of past works have studied TLB designs to lower access times and miss rates, specifically for uniprocessors. With the growing dominance of chip multiprocessors (CMPs), it is necessary to examine TLB performance in the context of parallel workloads.\par This work is the first to present TLB prefetchers that exploit commonality in TLB miss patterns across cores in CMPs. We propose and evaluate two Inter-Core Cooperative (ICC) TLB prefetching mechanisms, assessing their effectiveness at eliminating TLB misses both individually and together. Our results show these approaches require at most modest hardware and can collectively eliminate 19\% to 90\% of data TLB (D-TLB) misses across the surveyed parallel workloads.\par We also compare performance improvements across a range of hardware and software implementation possibilities. We find that while a fully-hardware implementation results in average performance improvements of 8-46\% for a range of TLB sizes, a hardware/software approach yields improvements of 4-32\%. Overall, our work shows that TLB prefetchers exploiting inter-core correlations can effectively eliminate TLB misses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "parallelism; prefetching; translation lookaside buffer", } @Article{Huang:2010:OES, author = "Ruirui Huang and Daniel Y. Deng and G. Edward Suh", title = "{Orthrus}: efficient software integrity protection on multi-cores", journal = j-SIGPLAN, volume = "45", number = "3", pages = "371--384", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736062", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper proposes an efficient hardware/software system that significantly enhances software security through diversified replication on multi-cores. Recent studies show that a large class of software attacks can be detected by running multiple versions of a program simultaneously and checking the consistency of their behaviors. However, execution of multiple replicas incurs significant overheads on today's computing platforms, especially with fine-grained comparisons necessary for high security. 
Orthrus exploits similarities in automatically generated replicas to enable simultaneous execution of those replicas with minimal overheads; the architecture reduces memory and bandwidth overheads by compressing multiple memory spaces together, and additional power consumption and silicon area by eliminating redundant computations. Utilizing the hardware architecture, Orthrus implements a fine-grained memory layout diversification with the LLVM compiler and can detect corruptions in both pointers and critical data. Experiments indicate that the Orthrus architecture incurs minimal overheads and provides a protection against a broad range of attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "memory protection; multi-core architecture; replication-aware architecture; software diversity and redundancy; software security", } @Article{Feng:2010:SPS, author = "Shuguang Feng and Shantanu Gupta and Amin Ansari and Scott Mahlke", title = "Shoestring: probabilistic soft error reliability on the cheap", journal = j-SIGPLAN, volume = "45", number = "3", pages = "385--396", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736063", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Aggressive technology scaling provides designers with an ever increasing budget of cheaper and faster transistors. Unfortunately, this trend is accompanied by a decline in individual device reliability as transistors become increasingly susceptible to soft errors. We are quickly approaching a new era where resilience to soft errors is no longer a luxury that can be reserved for just processors in high-reliability, mission-critical domains. Even processors used in mainstream computing will soon require protection. However, due to tighter profit margins, reliable operation for these devices must come at little or no cost. This paper presents Shoestring, a minimally invasive software solution that provides high soft error coverage with very little overhead, enabling its deployment even in commodity processors with 'shoestring' reliability budgets. Leveraging intelligent analysis at compile time, and exploiting low-cost, symptom-based error detection, Shoestring is able to focus its efforts on protecting statistically-vulnerable portions of program code. Shoestring effectively applies instruction duplication to protect only those segments of code that, when subjected to a soft error, are likely to result in user-visible faults without first exhibiting symptomatic behavior. Shoestring is able to recover from an additional 33.9\% of soft errors that are undetected by a symptom-only approach, achieving an overall user-visible failure rate of 1.6\%. 
This reliability improvement comes at a modest performance overhead of 15.8\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compiler analysis; error detection; fault injection", } @Article{Yoon:2010:VFE, author = "Doe Hyun Yoon and Mattan Erez", title = "Virtualized and flexible {ECC} for main memory", journal = j-SIGPLAN, volume = "45", number = "3", pages = "397--408", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736064", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a general scheme for virtualizing main memory error-correction mechanisms, which map redundant information needed to correct errors into the memory namespace itself. We rely on this basic idea, which increases flexibility to increase error protection capabilities, improve power efficiency, and reduce system cost; with only small performance overheads. We augment the virtual memory system architecture to detach the physical mapping of data from the physical mapping of its associated ECC information. We then use this mechanism to develop two-tiered error protection techniques that separate the process of detecting errors from the rare need to also correct errors, and thus save energy. We describe how to provide strong chipkill and double-chip kill protection using existing DRAM and packaging technology. We show how to maintain access granularity and redundancy overheads, even when using $\times 8$ DRAM chips. We also evaluate error correction for systems that do not use ECC DIMMs. Overall, analysis of demanding SPEC CPU 2006 and PARSEC benchmarks indicates that performance overhead is only 1\% with ECC DIMMs and less than 10\% using standard Non-ECC DIMM configurations, that DRAM power savings can be as high as 27\%, and that the system energy-delay product is improved by 12\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "error correction; fault tolerance; memory systems; reliability", } @Article{Li:2010:AAB, author = "Minming Li and Chun Jason Xue and Tiantian Liu and Yingchao Zhao", title = "Analysis and approximation for bank selection instruction minimization on partitioned memory architecture", journal = j-SIGPLAN, volume = "45", number = "4", pages = "1--8", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755888.1755890", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A large number of embedded systems include 8-bit microcontrollers for their energy efficiency and low cost. Multi-bank memory architecture is commonly applied in 8-bit microcontrollers to increase the size of memory without extending address buses. To switch among different memory banks, a special instruction, Bank Selection, is used. How to minimize the number of bank selection instructions inserted is important to reduce code size for embedded systems.\par In this paper, we consider how to insert the minimum number of bank selection instructions in a program to achieve feasibility. 
A program can be represented by a control flow graph (CFG). We prove that it is NP-hard to insert the minimum number of bank selection instructions if all the variables are pre-assigned to memory banks. Therefore, we introduce a 2-approximation algorithm using a rounding method. When the CFG is a tree or the out-degree of each node in the CFG is at most two, we show that we can insert the bank selection instructions optimally in polynomial time. We then consider the case when there are some nodes that do not access any memory bank and design a dynamic programming method to compute the optimal insertion strategy when the CFG is a tree. Experimental results show that the proposed techniques can reduce bank selection instructions significantly on partitioned memory architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "bank selection instruction minimization; partitioned memory architecture", } @Article{Pyka:2010:VSL, author = "Robert Pyka and Felipe Klein and Peter Marwedel and Stylianos Mamagkakis", title = "Versatile system-level memory-aware platform description approach for embedded {MPSoCs}", journal = j-SIGPLAN, volume = "45", number = "4", pages = "9--16", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755888.1755891", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we present a novel system modeling language which targets primarily the development of source-level multiprocessor memory aware optimizations.\par In contrast to previous system modeling approaches this approach tries to model the whole system and especially the memory hierarchy in a structural and semantically accessible way. Previous approaches primarily support generation of simulators or retargetable code selectors and thus concentrate on pure behavioral models or describe only the processor instruction set in a semantically accessible way. A simple, database-like interface is offered to the optimization developer, which in conjunction with the MACCv2 framework enables rapid development of source-level architecture independent optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "architecture description; channel; component; configuration; definition; energy models; framework", } @Article{Kim:2010:ODM, author = "Yongjoo Kim and Jongeun Lee and Aviral Shrivastava and Yunheung Paek", title = "Operation and data mapping for {CGRAs} with multi-bank memory", journal = j-SIGPLAN, volume = "45", number = "4", pages = "17--26", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755892", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Coarse Grain Reconfigurable Architectures (CGRAs) promise high performance at high power efficiency. They fulfill this promise by keeping the hardware extremely simple, and moving the complexity to application mapping. One major challenge comes in the form of data mapping.
For reasons of power-efficiency and complexity, CGRAs use multi-bank local memory, and a row of PEs share memory access. In order for each row of the PEs to access any memory bank, there is a hardware arbiter between the memory requests generated by the PEs and the banks of the local memory. However, a fundamental restriction remains that a bank cannot be accessed by two different PEs at the same time. We propose to meet this challenge by mapping application operations onto PEs and data into memory banks in a way that avoids such conflicts. Our experimental results on kernels from multimedia benchmarks demonstrate that our local memory-aware compilation approach can generate mappings that are up to 40\% better in performance (17.3\% on average) compared to a memory-unaware scheduler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "arbiter; bank conflict; coarse-grained reconfigurable architecture; compilation; multi-bank memory", } @Article{Foroozannejad:2010:LDB, author = "Mohammad H. Foroozannejad and Matin Hashemi and Trevor L. Hodges and Soheil Ghiasi", title = "Look into details: the benefits of fine-grain streaming buffer analysis", journal = j-SIGPLAN, volume = "45", number = "4", pages = "27--36", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755894", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many embedded applications demand processing of a seemingly endless stream of input data in real-time. Productive development of such applications is typically carried out by synthesizing software from high-level specifications, such as data-flow graphs. In this context, we study the problem of inter-actor buffer allocation, which is a critical step during compilation of streaming applications. We argue that fine-grain analysis of buffers' spatio-temporal characteristics, as opposed to conventional live range analysis, enables dramatic improvements in buffer sharing. Improved sharing translates to reduction of the compiled binary memory footprint, which is of prime concern in many embedded systems. We transform the buffer allocation problem to two-dimensional packing using complex polygons. We develop an evolutionary packing algorithm, which readily yields buffer allocations. 
Experimental results show an average of over 7X and 2X improvement in total buffer size, compared to baseline and conventional live range analysis schemes, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "buffer management; optimization; software synthesis; streaming applications; synchronous data flow", } @Article{Perathoner:2010:MSE, author = "Simon Perathoner and Tobias Rein and Lothar Thiele and Kai Lampka and Jonas Rox", title = "Modeling structured event streams in system level performance analysis", journal = j-SIGPLAN, volume = "45", number = "4", pages = "37--46", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755895", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper extends the methodology of analytic real-time analysis of distributed embedded systems towards merging and extracting sub-streams based on event type information. For example, one may first merge a set of given event streams, then process them jointly and finally decompose them into separate streams again. In other words, data streams can be hierarchically composed into higher level event streams and decomposed later on again. The proposed technique is strictly compositional, hence highly suited for being embedded into well known performance evaluation frameworks such as Symta/S and MPA (Modular Performance Analysis). It is based on a novel characterization of structured event streams which we denote as Event Count Curves. They characterize the structure of event streams in which the individual events belong to a finite number of classes. This new concept avoids the explicit maintenance of stream-individual information when routing a composed stream through a network of system components. Nevertheless it allows an arbitrary composition and decomposition of sub-streams at any stage of the distributed event processing. For evaluating our approach we analyze a realistic case-study and compare the obtained results with other existing techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "event count curves; performance analysis; real-time calculus", } @Article{Brandt:2010:TCA, author = "Jens Brandt and Klaus Schneider and Sandeep K. Shukla", title = "Translating concurrent action oriented specifications to synchronous guarded actions", journal = j-SIGPLAN, volume = "45", number = "4", pages = "47--56", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755888.1755896", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent Action-Oriented Specifications (CAOS) model the behavior of a synchronous hardware circuit as asynchronous guarded actions at an abstraction level higher than the Register Transfer Level (RTL). Previous approaches always considered the compilation of CAOS, which includes a transformation of the under-lying model of computation and the scheduling of guarded actions per clock cycle, as a tightly integrated step. 
In this paper, we present a new compilation procedure, which separates these two tasks and translates CAOS models to synchronous guarded actions with an explicit interface to a scheduler. This separation of concerns has many advantages, including better analyses and integration of custom schedulers. Our method also generates assertions that each scheduler must obey that can be fulfilled by algorithms for scheduler synthesis like those developed in supervisory control. We present our translation procedure in detail and illustrate it by various examples. We also show that our method simplifies formal verification of hardware synthesized from CAOS specifications over previously known formal verification approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "code generation; concurrent action-oriented specifications; guarded commands; synchronous languages", } @Article{Delaval:2010:CMD, author = "Gwena{\"e}l Delaval and Herv{\'e} Marchand and Eric Rutten", title = "Contracts for modular discrete controller synthesis", journal = j-SIGPLAN, volume = "45", number = "4", pages = "57--66", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755898", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe the extension of a reactive programming language with a behavioral contract construct. It is dedicated to the programming of reactive control of applications in embedded systems, and involves principles of the supervisory control of discrete event systems. Our contribution is in a language approach where modular discrete controller synthesis (DCS) is integrated, and it is concretized in the encapsulation of DCS into a compilation process. From transition system specifications of possible behaviors, DCS automatically produces controllers that make the controlled system satisfy the property given as objective. Our language features and compiling technique provide correctness-by-construction in that sense, and enhance reliability and verifiability. Our application domain is adaptive and reconfigurable systems: closed-loop adaptation mechanisms enable flexible execution of functionalities w.r.t. changing resource and environment conditions. Our language can serve programming such adaption controllers. This paper particularly describes the compilation of the language. We present a method for the modular application of discrete controller synthesis on synchronous programs, and its integration in the BZR language. We consider structured programs, as a composition of nodes, and first apply DCS on particular nodes of the program, in order to reduce the complexity of the controller computation; then, we allow the abstraction of parts of the program for this computation; and finally, we show how to recompose the different controllers computed from different abstractions for their correct co-execution with the initial program. 
Our work is illustrated with examples, and we present quantitative results about its implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "adaptive and reconfigurable systems; components; contracts; discrete controller synthesis; modularity; reactive systems; synchronous programming", } @Article{Schlickling:2010:SAD, author = "Marc Schlickling and Markus Pister", title = "Semi-automatic derivation of timing models for {WCET} analysis", journal = j-SIGPLAN, volume = "45", number = "4", pages = "67--76", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755888.1755899", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Embedded systems are widely used for supporting our everyday life. In the area of safety-critical systems human life often depends on the system's correct behavior. Many of such systems are hard real-time systems, so that the notion of correctness not only means functional correctness. They additionally have to obey stringent timing constraints, i.e. timely task completion under all circumstances is essential. An example for such a safety-critical system is the flight control computer in an airplane, which is responsible for stability, attitude and path control.\par In order to derive guarantees on the timing behavior of hard real-time systems, the worst-case execution time (WCET) of each task in the system has to be determined. Saarland University and AbsInt GmbH have successfully developed the aiT WCET analyzer for computing safe upper bounds on the WCET of a task. The computation is mainly based on abstract interpretation of timing models of the processor and its periphery. Such timing models are currently hand-crafted by human experts. Therefore their implementation is a time-consuming and error-prone process.\par Modern processors or system controllers are automatically synthesized out of formal hardware specifications like VHDL or Verilog. Besides the system's functional behavior, such specifications provide all information needed for the creation of a timing model. But due to their size and complexity, manually examining the sources is even more complex than only looking at the processor manuals. Moreover, this would not reduce the effort nor the probability of implementation errors.\par To face this problem, this paper proposes a method for semi-automatically deriving suitable timing models out of formal hardware specifications in VHDL that fit to the tool chain of the aiT WCET analyzer.
By this, we reduce the creation time of timing models from months to weeks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "hard real-time; vhdl; worst-case execution time", } @Article{Viskic:2010:DEA, author = "Ines Viskic and Lochi Yu and Daniel Gajski", title = "Design exploration and automatic generation of {MPSoC} platform {TLMs} from {Kahn Process Network} applications", journal = j-SIGPLAN, volume = "45", number = "4", pages = "77--84", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755888.1755900", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With increasingly more complex Multi-Processor Systems on Chip (MPSoC) and shortening time-to-market projections, Transaction Level Modeling and Platform Aware Design are seen as promising approaches to efficient MPSoC design.\par In this paper, we present an automatized 3-phase process of Platform Aware Design and apply it to Kahn Process Networks (KPN) applications, a widely used model of computation for data-flow applications. We start with the KPN application and an abstract platform template and automatically generate an executable TLM with estimated timing that accurately reflects the system platform. We support homogeneous and heterogeneous multi-master platform models with shared memory or direct communication paradigm. The communication in heterogeneous platform modules is enabled with the transducer unit (TX) for protocol translation. TX units also act as message routers to support Network on Chip (NoC) communication.\par We evaluate our approach with the case study of the H.264 Encoder design process, in which the specification compliant design was reached from the KPN application in less than 2 hours. The example demonstrates that automatic generation of platform aware TLMs enables a fast, efficient and error resilient design process.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "automatic generation; Kahn Process Network; process mapping; transaction level model", } @Article{Ozturk:2010:CDN, author = "Ozcan Ozturk and Mahmut Kandemir and Mary J. Irwin and Sri H. K. Narayanan", title = "Compiler directed network-on-chip reliability enhancement for chip multiprocessors", journal = j-SIGPLAN, volume = "45", number = "4", pages = "85--94", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755902", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Chip multiprocessors (CMPs) are expected to be the building blocks for future computer systems. While architecting these emerging CMPs is a challenging problem on its own, programming them is even more challenging. As the number of cores accommodated in chip multiprocessors increases, network-on-chip (NoC) type communication fabrics are expected to replace traditional point-to-point buses. Most of the prior software related work so far targeting CMPs focus on performance and power aspects.
However, as technology scales, components of a CMP are being increasingly exposed to both transient and permanent hardware failures. This paper presents and evaluates a compiler-directed power-performance aware reliability enhancement scheme for network-on-chip (NoC) based chip multiprocessors (CMPs). The proposed scheme improves on-chip communication reliability by duplicating messages traveling across CMP nodes such that, for each original message, its duplicate uses a different set of communication links as much as possible (to satisfy performance constraint). In addition, our approach tries to reuse communication links across the different phases of the program to maximize link shutdown opportunities for the NoC (to satisfy power constraint). Our results show that the proposed approach is very effective in improving on-chip network reliability, without causing excessive power or performance degradation. In our experiments, we also evaluate the performance oriented and energy oriented versions of our compiler-directed reliability enhancement scheme, and compare it to two pure hardware based fault tolerant routing schemes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "chip multiprocessors; compiler; noc; reliability", } @Article{Kulkarni:2010:IBP, author = "Prasad A. Kulkarni and Michael R. Jantz and David B. Whalley", title = "Improving both the performance benefits and speed of optimization phase sequence searches", journal = j-SIGPLAN, volume = "45", number = "4", pages = "95--104", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755888.1755903", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The issues of compiler optimization phase ordering and selection present important challenges to compiler developers in several domains, and in particular to the speed, code size, power, and cost-constrained domain of embedded systems. Different sequences of optimization phases have been observed to provide the best performance for different applications. Compiler writers and embedded systems developers have recently addressed this problem by conducting iterative empirical searches using machine-learning based heuristic algorithms in an attempt to find the phase sequences that are most effective for each application. Such searches are generally performed at the program level, although a few studies have been performed at the function level. The finer granularity of function-level searches has the potential to provide greater overall performance benefits, but only at the cost of slower searches caused by a greater number of performance evaluations that often require expensive program simulations. In this paper, we evaluate the performance benefits and search time increases of function-level approaches as compared to their program-level counterparts. We, then, present a novel search algorithm that conducts distinct function-level searches simultaneously, but requires only a single program simulation for evaluating the performance of potentially unique sequences for each function. 
Thus, our new hybrid search strategy provides the enhanced performance benefits of function-level searches with a search-time cost that is comparable to or less than program-level searches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "genetic algorithms; phase ordering", } @Article{Li:2010:ECU, author = "Weijia Li and Youtao Zhang", title = "An efficient code update scheme for {DSP} applications in mobile embedded systems", journal = j-SIGPLAN, volume = "45", number = "4", pages = "105--114", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755904", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "DSP processors usually provide dedicated address generation units (AGUs) to assist address computation. By carefully allocating variables in the memory, DSP compilers take advantage of AGUs and generate efficient code with compact size and improved performance. However, DSP applications running on mobile embedded systems often need to be updated after their initial releases. Studies showed that small changes at the source code level may significantly change the variable layout in the memory and thus the binary code, which causes large energy overheads to mobile embedded systems that patch through wireless or satellite communication, and often pecuniary burden to the users.\par In this paper, we propose an update-conscious code update scheme to effectively reduce patch size. It first performs incremental offset assignment based on a recent variable coalescing heuristic, and then summarizes the code difference using two types of update primitives. Our experimental results showed that using update-conscious code update can greatly improve code similarity and thus reduce the update script sizes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "context-aware script; context-unaware script; incremental coalescing general offset assignment (icgoa); incremental coalescing simple offset assignment (icsoa)", } @Article{Wernsing:2010:ECF, author = "John Robert Wernsing and Greg Stitt", title = "Elastic computing: a framework for transparent, portable, and adaptive multi-core heterogeneous computing", journal = j-SIGPLAN, volume = "45", number = "4", pages = "115--124", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755906", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the past decade, system architectures have started on a clear trend towards increased parallelism and heterogeneity, often resulting in speedups of 10x to 100x. Despite numerous compiler and high-level synthesis studies, usage of such systems has largely been limited to device experts, due to significantly increased application design complexity. 
To reduce application design complexity, we introduce elastic computing - a framework that separates functionality from implementation details by enabling designers to use specialized functions, called elastic functions, which enable an optimization framework to explore thousands of possible implementations, even ones using different algorithms. Elastic functions allow designers to execute the same application code efficiently on potentially any architecture and for different runtime parameters such as input size, battery life, etc. In this paper, we present an initial elastic computing framework that transparently optimizes application code onto diverse systems, achieving significant speedups ranging from 1.3x to 46x on a hyper-threaded Xeon system with an FPGA accelerator, a 16-CPU Opteron system, and a quad-core Xeon system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "elastic computing; fpga; heterogeneous architectures; multi-core; speedup", } @Article{Biehl:2010:ISA, author = "Matthias Biehl and Chen DeJiu and Martin T{\"o}rngren", title = "Integrating safety analysis into the model-based development toolchain of automotive embedded systems", journal = j-SIGPLAN, volume = "45", number = "4", pages = "125--132", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755907", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The automotive industry has a growing demand for the seamless integration of safety analysis tools into the model-based development toolchain for embedded systems. This requires translating concepts of the automotive domain to the safety domain. We automate such a translation between the automotive architecture description language EAST-ADL2 and the safety analysis tool HiP-HOPS by using model transformations and by leveraging the advantages of different model transformation techniques. Through this integration, the analysis can be conducted early in the development process, when the system can be redesigned to fulfill safety goals with relatively low effort and cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "architecture description language; model-based development; safety analysis; tool integration", } @Article{Fischmeister:2010:SBP, author = "Sebastian Fischmeister and Yanmeng Ba", title = "Sampling-based program execution monitoring", journal = j-SIGPLAN, volume = "45", number = "4", pages = "133--142", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755908", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For its high overall cost during product development, program debugging is an important aspect of system development. Debugging is a hard and complex activity, especially in time-sensitive systems which have limited resources and demanding timing constraints. System tracing is a frequently used technique for debugging embedded systems. A specific use of system tracing is to monitor and debug control-flow problems in programs. 
However, it is difficult to implement because of the potentially high overhead it might introduce to the system and the changes which can occur to the system behavior due to tracing. To solve the above problems, in this work, we present a sampling-based approach to execution monitoring which specifically helps developers debug time-sensitive systems such as real-time applications. We build the system model and propose three theorems to determine the sampling period in different scenarios. We also design seven heuristics and an instrumentation framework to extend the sampling period, which can reduce the monitoring overhead and achieve an optimal tradeoff between accuracy and overhead introduced by instrumentation. Using this monitoring framework, we can use the information extracted through sampling to reconstruct the system state and execution paths to locate the deviation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "debugging; embedded system; monitoring; sampling; tracing", } @Article{Shrivastava:2010:CVE, author = "Aviral Shrivastava and Jongeun Lee and Reiley Jeyapaul", title = "Cache vulnerability equations for protecting data in embedded processor caches from soft errors", journal = j-SIGPLAN, volume = "45", number = "4", pages = "143--152", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755888.1755910", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Continuous technology scaling has brought us to a point where transistors have become extremely susceptible to cosmic radiation strikes, or soft errors. Inside the processor, caches are most vulnerable to soft errors, and techniques at various levels of design abstraction, e.g., fabrication, gate design, circuit design, and microarchitecture-level, have been developed to protect data in caches. However, no work has been done to investigate the effect of code transformations on the vulnerability of data in caches. Data is vulnerable to soft errors in the cache only if it will be read by the processor, and not if it will be overwritten. Since code transformations can change the read-write pattern of program variables, they significantly affect the soft error vulnerability of program variables in the cache. We observe that an opportunity often exists to significantly reduce the soft error vulnerability of cache data by trading off a little performance. However, even if one wanted to exploit this trade-off, it is difficult, since there are no efficient techniques to estimate the vulnerability of data in caches. To this end, this paper develops an efficient static analysis method to estimate program vulnerability in caches, which enables the compiler to exploit the performance-vulnerability trade-offs in applications.
Finally, as compared to simulation-based estimation, static analysis techniques provide insights into vulnerability calculations that suggest some simple schemes to reduce program vulnerability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "cache vulnerability; code transformation; compiler technique; embedded processors; soft errors; static analysis", } @Article{Altmeyer:2010:RAT, author = "Sebastian Altmeyer and Claire Maiza and Jan Reineke", title = "Resilience analysis: tightening the {CRPD} bound for set-associative caches", journal = j-SIGPLAN, volume = "45", number = "4", pages = "153--162", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755888.1755911", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In preemptive real-time systems, scheduling analyses need --- in addition to the worst-case execution time --- the context-switch cost. In case of preemption, the preempted and the preempting task may interfere on the cache memory.\par This interference leads to additional cache misses in the preempted task. The delay due to these cache misses is referred to as the cache-related preemption delay (CRPD), which constitutes the major part of the context-switch cost.\par In this paper, we present a new approach to compute tight bounds on the CRPD for LRU set-associative caches, based on analyses of both the preempted and the preempting task. Previous approaches analyzing both the preempted and the preempting task were either imprecise or unsound.\par As the basis of our approach we introduce the notion of resilience: The resilience of a memory block of the preempted task is the maximal number of memory accesses a preempting task could perform without causing an additional miss to this block. By computing lower bounds on the resilience of blocks and an upper bound on the number of accesses by a preempting task, one can guarantee that some blocks may not contribute to the CRPD. The CRPD analysis based on resilience considerably outperforms previous approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "cache-related preemption delay; lru caches; timing analysis", } @Article{Wang:2010:RRA, author = "Yi Wang and Duo Liu and Meng Wang and Zhiwei Qin and Zili Shao and Yong Guan", title = "{RNFTL}: a reuse-aware {NAND} flash translation layer for flash memory", journal = j-SIGPLAN, volume = "45", number = "4", pages = "163--172", month = apr, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1755951.1755912", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Apr 15 12:45:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we propose a hybrid-level flash translation layer (FTL) called RNFTL (Reuse-Aware NFTL) to improve the endurance and space utilization of NAND flash memory. Our basic idea is to prevent a primary block with many free pages from being erased in a merge operation. The preserved primary blocks are further reused as replacement blocks. In such a way, the space utilization and the number of erase counts for each block in NAND flash can be enhanced.
To the best of our knowledge, this is the first work to employ a reuse-aware strategy in FTL for improving the space utilization and endurance of NAND flash. We conduct experiments on a set of traces collected from real workloads in daily life. The experimental results show that our technique achieves significant improvements in space utilization, block lifetime, and wear-leveling compared with previous work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "endurance; flash memory; reuse; space utilization; wear-leveling", } @Article{Agerwala:2010:ECC, author = "Tilak Agerwala", title = "Exascale computing: the challenges and opportunities in the next decade", journal = j-SIGPLAN, volume = "45", number = "5", pages = "1--2", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693454", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Supercomputing systems have made great strides in recent years as the extensive computing needs of cutting-edge engineering work and scientific discovery have driven the development of more powerful systems. In 2008, the first petaflop machine was released, and historic trends indicate that in ten years, we should be at the exascale level. Indeed, various agencies are targeting a computer system capable of 1 Exaop ($10^{18}$ ops) of computation within the next decade. We believe that applications in many industries will be materially transformed by exascale computers.\par Meeting the exascale challenge will require significant innovation in technology, architecture and programmability. Power is a fundamental problem at all levels; traditional memory cost and performance are not keeping pace with compute potential; the storage hierarchy will have to be re-architected; networks will be a much bigger part of the system cost; reliability at exascale levels will require a holistic approach to architecture design, and programmability and ease-of-use will be an essential component to extract the promised performance at the exascale level.\par In this talk, I will discuss the importance of exascale computing and address the major challenges, touching on the areas of technology, architecture, reliability and usability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "architecture; challenges; exascale", } @Article{Mendez-Lojo:2010:SDO, author = "Mario M{\'e}ndez-Lojo and Donald Nguyen and Dimitrios Prountzos and Xin Sui and M. Amber Hassaan and Milind Kulkarni and Martin Burtscher and Keshav Pingali", title = "Structure-driven optimizations for amorphous data-parallel programs", journal = j-SIGPLAN, volume = "45", number = "5", pages = "3--14", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693457", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Irregular algorithms are organized around pointer-based data structures such as graphs and trees, and they are ubiquitous in applications.
Recent work by the Galois project has provided a systematic approach for parallelizing irregular applications based on the idea of optimistic or speculative execution of programs. However, the overhead of optimistic parallel execution can be substantial. In this paper, we show that many irregular algorithms have structure that can be exploited and present three key optimizations that take advantage of algorithmic structure to reduce speculative overheads. We describe the implementation of these optimizations in the Galois system and present experimental results to demonstrate their benefits. To the best of our knowledge, this is the first system to exploit algorithmic structure to optimize the execution of irregular programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "amorphous data-parallelism; cautious operator implementations; irregular programs; iteration coalescing; one-shot optimization; optimistic parallelization; synchronization overheads", } @Article{Coons:2010:GEU, author = "Katherine E. Coons and Sebastian Burckhardt and Madanlal Musuvathi", title = "{GAMBIT}: effective unit testing for concurrency libraries", journal = j-SIGPLAN, volume = "45", number = "5", pages = "15--24", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693458", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As concurrent programming becomes prevalent, software providers are investing in concurrency libraries to improve programmer productivity. Concurrency libraries improve productivity by hiding error-prone, low-level synchronization from programmers and providing higher-level concurrent abstractions. Testing such libraries is difficult, however, because concurrency failures often manifest only under particular scheduling circumstances. Current best testing practices are often inadequate: heuristic-guided fuzzing is not systematic, systematic schedule enumeration does not find bugs quickly, and stress testing is neither systematic nor fast.\par To address these shortcomings, we propose a prioritized search technique called GAMBIT that combines the speed benefits of heuristic-guided fuzzing with the soundness, progress, and reproducibility guarantees of stateless model checking. GAMBIT combines known techniques such as partial-order reduction and preemption-bounding with a generalized best-first search framework that prioritizes schedules likely to expose bugs. We evaluate GAMBIT's effectiveness on newly released concurrency libraries for Microsoft's {.NET} framework. Our experiments show that GAMBIT finds bugs more quickly than prior stateless model checking techniques without compromising coverage guarantees or reproducibility.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "concurrency; model checking; multithreading; partial-order reduction; preemption bound; software testing", } @Article{Lee:2010:FXC, author = "Jonathan K.
Lee and Jens Palsberg", title = "Featherweight {X10}: a core calculus for async-finish parallelism", journal = j-SIGPLAN, volume = "45", number = "5", pages = "25--36", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693459", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a core calculus with two of X10's key constructs for parallelism, namely async and finish. Our calculus forms a convenient basis for type systems and static analyses for languages with async-finish parallelism, and for tractable proofs of correctness. For example, we give a short proof of the deadlock-freedom theorem of Saraswat and Jagadeesan. Our main contribution is a type system that solves the open problem of context-sensitive may-happen-in-parallel analysis for languages with async-finish parallelism. We prove the correctness of our type system and we report experimental results of performing type inference on 13,000 lines of X10 code. Our analysis runs in polynomial time, takes a total of 28 seconds on our benchmarks, and produces a low number of false positives, which suggests that our analysis is a good basis for other analyses such as race detectors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "operational semantics; parallelism; static analysis", } @Article{Mannarswamy:2010:CAS, author = "Sandya Mannarswamy and Dhruva R. Chakrabarti and Kaushik Rajan and Sujoy Saraswati", title = "Compiler aided selective lock assignment for improving the performance of software transactional memory", journal = j-SIGPLAN, volume = "45", number = "5", pages = "37--46", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693460", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Atomic sections have been recently introduced as a language construct to improve the programmability of concurrent software. They simplify programming by not requiring the explicit specification of locks for shared data. Typically atomic sections are supported in software either through the use of optimistic concurrency by using transactional memory or through the use of pessimistic concurrency using compiler-assigned locks. As a software transactional memory (STM) system does not take advantage of the specific memory access patterns of an application it often suffers from false conflicts and high validation overheads. On the other hand, the compiler usually ends up assigning coarse grain locks as it relies on whole program points-to analysis which is conservative by nature. This adversely affects performance by limiting concurrency. In order to mitigate the disadvantages associated with STM's lock assignment scheme, we propose a hybrid approach which combines STM's lock assignment with a compiler aided selective lock assignment scheme (referred to as SCLA-STM). 
SCLA-STM overcomes the inefficiencies associated with a purely compile-time lock assignment approach by (i) using the underlying STM for shared variables where only a conservative analysis is possible by the compiler (e.g., in the presence of may-alias points to information) and (ii) being selective about the shared data chosen for the compiler-aided lock assignment. We describe our prototype SCLA-STM scheme implemented in the HP-UX IA-64 C/C++ compiler, using TL2 as our STM implementation. We show that SCLA-STM improves application performance for certain STAMP benchmarks from 1.68\% to 37.13\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compilers; multithreading; parallelization; performance", } @Article{Rossbach:2010:TPA, author = "Christopher J. Rossbach and Owen S. Hofmann and Emmett Witchel", title = "Is transactional programming actually easier?", journal = j-SIGPLAN, volume = "45", number = "5", pages = "47--56", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693462", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Chip multi-processors (CMPs) have become ubiquitous, while tools that ease concurrent programming have not. The promise of increased performance for all applications through ever more parallel hardware requires good tools for concurrent programming, especially for average programmers. Transactional memory (TM) has enjoyed recent interest as a tool that can help programmers program concurrently.\par The transactional memory (TM) research community is heavily invested in the claim that programming with transactional memory is easier than alternatives (like locks), but evidence for or against the veracity of this claim is scant. In this paper, we describe a user-study in which 237 undergraduate students in an operating systems course implement the same programs using coarse and fine-grain locks, monitors, and transactions. We surveyed the students after the assignment, and examined their code to determine the types and frequency of programming errors for each synchronization technique. Inexperienced programmers found baroque syntax a barrier to entry for transactional programming. On average, subjective evaluation showed that students found transactions harder to use than coarse-grain locks, but slightly easier to use than fine-grained locks. Detailed examination of synchronization errors in the students' code tells a rather different story. Overwhelmingly, the number and types of programming errors the students made was much lower for transactions than for locks. On a similar programming problem, over 70\% of students made errors with fine-grained locking, while less than 10\% made errors with transactions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "optimistic concurrency; synchronization; transactional memory", } @Article{Zyulkyarov:2010:DPU, author = "Ferad Zyulkyarov and Tim Harris and Osman S. 
Unsal and Adr{\'\i}an Cristal and Mateo Valero", title = "Debugging programs that use atomic blocks and transactional memory", journal = j-SIGPLAN, volume = "45", number = "5", pages = "57--66", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693463", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the emergence of research prototypes, programming using atomic blocks and transactional memory (TM) is becoming more attractive. This paper describes our experience building and using a debugger for programs written with these abstractions. We introduce three approaches: ({\em i\/}) debugging at the level of atomic blocks, where the programmer is shielded from implementation details (such as exactly what kind of TM is used, or indeed whether lock inference is used instead), ({\em ii\/}) debugging at the level of transactions, where conflict rates, read sets, write sets, and other TM internals are visible, and ({\em iii\/}) debug-time transactions, which let the programmer manipulate synchronization from within the debugger - e.g., enlarging the scope of an atomic block to try to identify a bug.\par In this paper we explain the rationale behind the new debugging approaches that we propose. We describe the design and implementation of an extension to the WinDbg debugger, enabling support for C\# programs using atomic blocks and TM. We also demonstrate the design of a 'conflict point discovery' technique for identifying program statements that introduce contention between transactions. We illustrate how these techniques can be used by optimizing a C\# version of the Genome application from STAMP TM benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "debugging; transactional memory", } @Article{Dalessandro:2010:NSS, author = "Luke Dalessandro and Michael F. Spear and Michael L. Scott", title = "{NOrec}: streamlining {STM} by abolishing ownership records", journal = j-SIGPLAN, volume = "45", number = "5", pages = "67--78", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693464", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Drawing inspiration from several previous projects, we present an ownership-record-free software transactional memory (STM) system that combines extremely low overhead with unusually clean semantics. While unlikely to scale to hundreds of active threads, this 'NOrec' system offers many appealing features: very low fast-path latency--as low as any system we know of that admits concurrent updates; publication and privatization safety; livelock freedom; a small, constant amount of global metadata, and full compatibility with existing data structure layouts; no false conflicts due to hash collisions; compatibility with both managed and unmanaged languages, and both static and dynamic compilation; and easy accommodation of closed nesting, inevitable (irrevocable) transactions, and starvation avoidance mechanisms. 
To the best of our knowledge, no extant STM system combines this set of features.\par While transactional memory for processors with hundreds of cores is likely to require hardware support, software implementations will be required for backward compatibility with current and near-future processors with 2--64 cores, as well as for fall-back in future machines when hardware resources are exhausted. Our experience suggests that NOrec may be an ideal candidate for such a software system. We also observe that it has considerable appeal for use within the operating system, and in systems that require both closed nesting and publication safety.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "ownership records; software transactional memory; transactional memory; transactional memory models", } @Article{Maldonado:2010:SST, author = "Walther Maldonado and Patrick Marlier and Pascal Felber and Adi Suissa and Danny Hendler and Alexandra Fedorova and Julia L. Lawall and Gilles Muller", title = "Scheduling support for transactional memory contention management", journal = j-SIGPLAN, volume = "45", number = "5", pages = "79--90", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693465", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactional Memory (TM) is considered as one of the most promising paradigms for developing concurrent applications. TM has been shown to scale well on multiple cores when the data access pattern behaves 'well,' i.e., when few conflicts are induced. In contrast, data patterns with frequent write sharing, with long transactions, or when many threads contend for a smaller number of cores, result in numerous conflicts. Until recently, TM implementations had little control of transactional threads, which remained under the supervision of the kernel's transaction-ignorant scheduler. Conflicts are thus traditionally resolved by consulting an STM-level {\em contention manager}. Consequently, the contention managers of these 'conventional' TM implementations suffer from a lack of precision and often fail to ensure reasonable performance in high-contention workloads.\par Recently, scheduling-based TM contention-management has been proposed for increasing TM efficiency under high-contention [2, 5, 19]. However, only user-level schedulers have been considered. In this work, we propose, implement and evaluate several novel kernel-level scheduling support mechanisms for TM contention management. We also investigate different strategies for efficient communication between the kernel and the user-level TM library. To the best of our knowledge, our work is the first to investigate kernel-level support for TM contention management.\par We have introduced kernel-level TM scheduling support into both the Linux and Solaris kernels.
Our experimental evaluation demonstrates that lightweight kernel-level scheduling support significantly reduces the number of aborts while improving transaction throughput on various workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "contention management; scheduling; transactional memory", } @Article{Barreto:2010:LPN, author = "Jo{\~a}o Barreto and Aleksandar Dragojevi{\'c} and Paulo Ferreira and Rachid Guerraoui and Michal Kapalka", title = "Leveraging parallel nesting in transactional memory", journal = j-SIGPLAN, volume = "45", number = "5", pages = "91--100", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693466", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Exploiting the emerging reality of affordable multi-core architectures goes through providing programmers with simple abstractions that would enable them to easily turn their sequential programs into concurrent ones that expose as much parallelism as possible. While transactional memory promises to make concurrent programming easy for a wide programmer community, current implementations either disallow nested transactions to run in parallel or do not scale to arbitrary parallel nesting depths. This is an important obstacle to the central goal of transactional memory, as programmers can only start parallel threads in restricted parts of their code.\par This paper addresses the intrinsic difficulty behind the support for parallel nesting in transactional memory, and proposes a novel solution that, to the best of our knowledge, is the first practical solution to meet the lowest theoretical upper bound known for the problem.\par Using a synthetic workload configured to test parallel transactions on a multi-core machine, a practical implementation of our algorithm yields substantial speed-ups (up to 22x with 33 threads) relative to serial nesting, and shows that the time to start and commit transactions, as well as to detect conflicts, is independent of nesting depth.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "fork-join; nested parallel programs; transactional memory; work-stealing", } @Article{Torrellas:2010:ESC, author = "Josep Torrellas and Bill Gropp and Jaime Moreno and Kunle Olukotun and Vivek Sarkar", title = "Extreme scale computing: challenges and opportunities", journal = j-SIGPLAN, volume = "45", number = "5", pages = "101--102", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693468", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "architecture; challenges; exascale", } @Article{Arvind:2010:HI, author = "Arvind", title = "Is hardware innovation over?", journal = j-SIGPLAN, volume = "45", number = "5", pages = "103--104", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693455", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L =
"0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "My colleagues, promotion committees, research funding agencies and business people often wonder if there is need for any architecture research. There seems to be no room to dislodge Intel IA-32. Even the number of new Application-Specific Integrated Circuits (ASICs) seems to be declining each year, because of the ever-increasing development cost.\par This viewpoint ignores another reality which is that the future will be dominated by mobile devices such as smart phones and the infrastructure needed to support consumer services on these devices. This is already restructuring the IT industry. To the first-order, in the mobile world functionality is determined by what can be supported within a 3W power budget. The only way to reduce power by one to two orders of magnitude is via functionally specialized hardware blocks. A fundamental shift is needed in the current design flow of systems-on-a-chip (SoCs) to produce them in a less-risky and cost-effective manner.\par In this talk we will present, via examples, a method of designing systems that facilitates the synthesis of complex SoCs from reusable 'IP' modules. The technical challenge is to provide a method for connecting modules in a parallel setting so that the functionality and the performance of the composite are predictable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "hardware innovation; system-on-chip", } @Article{Baghsorkhi:2010:APM, author = "Sara S. Baghsorkhi and Matthieu Delahaye and Sanjay J. Patel and William D. Gropp and Wen-mei W. Hwu", title = "An adaptive performance modeling tool for {GPU} architectures", journal = j-SIGPLAN, volume = "45", number = "5", pages = "105--114", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693470", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents an analytical model to predict the performance of\par general-purpose applications on a GPU architecture. The model is designed to provide performance information to an auto-tuning compiler and assist it in narrowing down the search to the more promising implementations. It can also be incorporated into a tool to help programmers better assess the performance bottlenecks in their code. We analyze each GPU kernel and identify how the kernel exercises major GPU microarchitecture features. To identify the performance bottlenecks accurately, we introduce an abstract interpretation of a GPU kernel, {\em work flow graph}, based on which we estimate the execution time of a GPU kernel. We validated our performance model on the NVIDIA GPUs using CUDA (Compute Unified Device Architecture). For this purpose, we used data parallel benchmarks that stress different GPU microarchitecture events such as uncoalesced memory accesses, scratch-pad memory bank conflicts, and control flow divergence, which must be accurately modeled but represent challenges to the analytical performance models. The proposed model captures full system complexity and shows high accuracy in predicting the performance trends of different optimized kernel implementations. 
We also describe our approach to extracting the performance model automatically from kernel code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "analytical model; GPU; parallel programming; performance estimation", } @Article{Choi:2010:MDA, author = "Jee W. Choi and Amik Singh and Richard W. Vuduc", title = "Model-driven autotuning of sparse matrix-vector multiply on {GPUs}", journal = j-SIGPLAN, volume = "45", number = "5", pages = "115--126", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693471", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a performance model-driven framework for automated performance tuning (autotuning) of sparse matrix-vector multiply (SpMV) on systems accelerated by graphics processing units (GPU). Our study consists of two parts.\par First, we describe several carefully hand-tuned SpMV implementations for GPUs, identifying key GPU-specific performance limitations, enhancements, and tuning opportunities. These implementations, which include variants on classical blocked compressed sparse row (BCSR) and blocked ELLPACK (BELLPACK) storage formats, match or exceed state-of-the-art implementations. For instance, our best BELLPACK implementation achieves up to 29.0 Gflop/s in single-precision and 15.7 Gflop/s in double-precision on the NVIDIA T10P multiprocessor (C1060), enhancing prior state-of-the-art unblocked implementations (Bell and Garland, 2009) by up to $1.8\times$ and $1.5\times$ for single- and double-precision, respectively.\par However, achieving this level of performance requires input matrix-dependent parameter tuning. Thus, in the second part of this study, we develop a performance model that can guide tuning. Like prior autotuning models for CPUs (e.g., Im, Yelick, and Vuduc, 2004), this model requires offline measurements and run-time estimation, but more directly models the structure of multithreaded vector processors like GPUs. We show that our model can identify the implementations that achieve within 15\% of those found through exhaustive search.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "GPU; performance modeling; sparse matrix-vector multiplication", } @Article{Zhang:2010:FTS, author = "Yao Zhang and Jonathan Cohen and John D. Owens", title = "Fast tridiagonal solvers on the {GPU}", journal = j-SIGPLAN, volume = "45", number = "5", pages = "127--136", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693472", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study the performance of three parallel algorithms and their hybrid variants for solving tridiagonal linear systems on a GPU: cyclic reduction (CR), parallel cyclic reduction (PCR) and recursive doubling (RD). We develop an approach to measure, analyze, and optimize the performance of GPU programs in terms of memory access, computation, and control overhead.
We find that CR enjoys linear algorithm complexity but suffers from more algorithmic steps and bank conflicts, while PCR and RD have fewer algorithmic steps but do more work each step. To combine the benefits of the basic algorithms, we propose hybrid CR+PCR and CR+RD algorithms, which improve the performance of PCR, RD and CR by 21\%, 31\% and 61\% respectively. Our GPU solvers achieve up to a 28x speedup over a sequential LAPACK solver, and a 12x speedup over a multi-threaded CPU solver.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "GPGPU; performance optimization; tridiagonal linear system", } @Article{Sandes:2010:CUG, author = "Edans Flavius O. Sandes and Alba Cristina M. A. de Melo", title = "{CUDAlign}: using {GPU} to accelerate the comparison of megabase genomic sequences", journal = j-SIGPLAN, volume = "45", number = "5", pages = "137--146", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693473", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Biological sequence comparison is a very important operation in Bioinformatics. Even though there do exist exact methods to compare biological sequences, these methods are often neglected due to their quadratic time and space complexity. In order to accelerate these methods, many GPU algorithms were proposed in the literature. Nevertheless, all of them restrict the size of the smallest sequence in such a way that Megabase genome comparison is prevented. In this paper, we propose and evaluate CUDAlign, a GPU algorithm that is able to compare Megabase biological sequences with an exact Smith--Waterman affine gap variant. CUDAlign was implemented in CUDA and tested in two GPU boards, separately. For real sequences whose size range from 1MBP (Megabase Pairs) to 47MBP, a close to uniform GCUPS (Giga Cells Updates per Second) was obtained, showing the potential scalability of our approach. Also, CUDAlign was able to compare the human chromosome 21 and the chimpanzee chromosome 22. This operation took 21 hours on GeForce GTX 280, resulting in a peak performance of 20.375 GCUPS. As far as we know, this is the first time such huge chromosomes are compared with an exact method.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "biological sequence comparison; GPU; Smith--Waterman", } @Article{Hofmeyr:2010:LBS, author = "Steven Hofmeyr and Costin Iancu and Filip Blagojevi{\'c}", title = "Load balancing on speed", journal = j-SIGPLAN, volume = "45", number = "5", pages = "147--158", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693475", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To fully exploit multicore processors, applications are expected to provide a large degree of thread-level parallelism. While adequate for low core counts and their typical workloads, the current load balancing support in operating systems may not be able to achieve efficient hardware utilization for parallel workloads. 
Balancing run queue length globally ignores the needs of parallel applications where threads are required to make equal progress. In this paper we present a load balancing technique designed specifically for parallel applications running on multicore systems. Instead of balancing run queue length, our algorithm balances the time a thread has executed on ``faster'' and ``slower'' cores. We provide a user level implementation of speed balancing on UMA and NUMA multi-socket architectures running Linux and discuss behavior across a variety of workloads, usage scenarios and programming models. Our results indicate that speed balancing when compared to the native Linux load balancing improves performance and provides good performance isolation in all cases considered. Speed balancing is also able to provide comparable or better performance than DWRR, a fair multi-processor scheduling implementation inside the Linux kernel. Furthermore, parallel application performance is often determined by the implementation of synchronization operations and speed balancing alleviates the need for tuning the implementations of such primitives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "load balancing; operating systems; parallel applications", } @Article{Hoefler:2010:SCP, author = "Torsten Hoefler and Christian Siebert and Andrew Lumsdaine", title = "Scalable communication protocols for dynamic sparse data exchange", journal = j-SIGPLAN, volume = "45", number = "5", pages = "159--168", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693476", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many large-scale parallel programs follow a bulk synchronous parallel (BSP) structure with distinct computation and communication phases. Although the communication phase in such programs may involve all (or large numbers) of the participating processes, the actual communication operations are usually sparse in nature. As a result, communication phases are typically expressed explicitly using point-to-point communication operations or collective operations. We define the dynamic sparse data-exchange (DSDE) problem and derive bounds in the well known LogGP model. While current approaches work well with static applications, they run into limitations as modern applications grow in scale, and as the problems that are being solved become increasingly irregular and dynamic.\par To enable the compact and efficient expression of the communication phase, we develop suitable sparse communication protocols for irregular applications at large scale. We discuss different irregular applications and show the sparsity in the communication for real-world input data. We discuss the time and memory complexity of commonly used protocols for the DSDE problem and develop {\em NBX\/} --a novel fast algorithm with constant memory overhead for solving it. Algorithm {\em NBX\/} improves the runtime of a sparse data-exchange among 8,192 processors on BlueGene/P by a factor of 5.6. 
In an application study, we show improvements of up to a factor of 28.9 for a parallel breadth-first search on 8,192 BlueGene/P processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "alltoall; distributed termination; irregular algorithms; nonblocking collective operations; sparse data exchange", } @Article{Romein:2010:LCI, author = "John W. Romein and P. Chris Broekema and Jan David Mol and Rob V. van Nieuwpoort", title = "The {LOFAR} correlator: implementation and performance analysis", journal = j-SIGPLAN, volume = "45", number = "5", pages = "169--178", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693477", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "LOFAR is the first of a new generation of radio telescopes. Rather than using expensive dishes, it forms a distributed sensor network that combines the signals from many thousands of simple antennas. Its revolutionary design allows observations in a frequency range that has hardly been studied before.\par Another novel feature of LOFAR is the elaborate use of {\em software\/} to process data, where traditional telescopes use customized hardware. This dramatically increases flexibility and substantially reduces costs, but the high processing and bandwidth requirements compel the use of a supercomputer. The antenna signals are centrally combined, filtered, optionally beam-formed, and correlated by an IBM Blue Gene/P.\par This paper describes the implementation of the so-called correlator. To meet the real-time requirements, the application is highly optimized, and reaches exceptionally high computational and I/O efficiencies. Additionally, we study the scalability of the system, and show that it scales well beyond the requirements. The optimizations allow us to use only half the planned amount of resources, {\em and\/} process 50\% more telescope data, significantly improving the effectiveness of the entire telescope.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "correlator; IBM Blue Gene/P; LOFAR", } @Article{Tzannes:2010:LBS, author = "Alexandros Tzannes and George C. Caragea and Rajeev Barua and Uzi Vishkin", title = "Lazy binary-splitting: a run-time adaptive work-stealing scheduler", journal = j-SIGPLAN, volume = "45", number = "5", pages = "179--190", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693479", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Lazy Binary Splitting (LBS), a user-level scheduler of nested parallelism for shared-memory multiprocessors that builds on existing Eager Binary Splitting work-stealing (EBS) implemented in Intel's Threading Building Blocks (TBB), but improves performance and ease-of-programming. In its simplest form (SP), EBS requires manual tuning by repeatedly running the application under carefully controlled conditions to determine a {\em stop-splitting-threshold (sst)\/} for every do-all loop in the code.
This threshold limits the parallelism and prevents excessive overheads for fine-grain parallelism. Besides being tedious, this tuning also over-fits the code to some particular dataset, platform and calling context of the do-all loop, resulting in poor performance portability for the code. LBS overcomes both the performance portability and ease-of-programming pitfalls of a manually fixed threshold by adapting dynamically to run-time conditions without requiring tuning.\par We compare LBS to Auto-Partitioner (AP), the latest default scheduler of TBB, which does not require manual tuning either but lacks context portability, and outperform it by 38.9\% using TBB's default AP configuration, and by 16.2\% after we tuned AP to our experimental platform. We also compare LBS to SP by manually finding SP's sst using a training dataset and then running both on a different execution dataset. LBS outperforms SP by 19.5\% on average, while allowing for improved performance portability without requiring tedious manual tuning. LBS also outperforms SP with {\em sst=1}, its default value when undefined, by 56.7\%, and serializing work-stealing (SWS), another work-stealer, by 54.7\%. Finally, compared to serializing inner parallelism (SI), which has been used by OpenMP, LBS is 54.2\% faster.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic scheduling; load balancing; nested parallelism; thread scheduling; work stealing", } @Article{Radojkovic:2010:TSB, author = "Petar Radojkovi{\'c} and Vladimir {\v{C}}akarevi{\'c} and Javier Verd{\'u} and Alex Pajuelo and Francisco J. Cazorla and Mario Nemirovsky and Mateo Valero", title = "Thread to strand binding of parallel network applications in massive multi-threaded systems", journal = j-SIGPLAN, volume = "45", number = "5", pages = "191--202", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693480", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In processors with several levels of hardware resource sharing, like CMPs in which each core is an SMT, the scheduling process becomes more complex than in processors with a single level of resource sharing, such as pure-SMT or pure-CMP processors. Once the operating system selects the set of applications to simultaneously schedule on the processor (workload), each application/thread must be assigned to one of the hardware contexts (strands). We call this last scheduling step the Thread to Strand Binding or TSB. In this paper, we show that the TSB impact on the performance of processors with several levels of shared resources is high. We measure a variation of up to 59\% between different TSBs of real multithreaded network applications running on the UltraSPARC T2 processor, which has three levels of resource sharing. In our view, this problem is going to be more acute in future multithreaded architectures comprising more cores, more contexts per core, and more levels of resource sharing.\par We propose a resource-sharing aware TSB algorithm (TSBSched) that significantly facilitates the problem of thread to strand binding for software-pipelined applications, representative of multithreaded network applications.
Our systematic approach encapsulates both the characteristics of the multithreaded processors under study and the structure of the software-pipelined applications. Once calibrated for a given processor architecture, our proposal does not require hardware knowledge on the side of the programmer, nor extensive profiling of the application. We validate our algorithm on the UltraSPARC T2 processor running a set of real multithreaded network applications on which we report improvements of up to 46\% compared to the current state-of-the-art dynamic schedulers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "CMT; process scheduling; simultaneous multithreading; UltraSPARC T2", } @Article{Zhang:2010:DCS, author = "Eddy Z. Zhang and Yunlian Jiang and Xipeng Shen", title = "Does cache sharing on modern {CMP} matter to the performance of contemporary multithreaded programs?", journal = j-SIGPLAN, volume = "45", number = "5", pages = "203--212", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693482", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most modern Chip Multiprocessors (CMP) feature shared cache on chip. For multithreaded applications, the sharing reduces communication latency among co-running threads, but also results in cache contention.\par A number of studies have examined the influence of cache sharing on multithreaded applications, but most of them have concentrated on the design or management of shared cache, rather than a systematic measurement of the influence. Consequently, prior measurements have been constrained by the reliance on simulators, the use of out-of-date benchmarks, and the limited coverage of deciding factors. The influence of CMP cache sharing on contemporary multithreaded applications remains preliminarily understood.\par In this work, we conduct a systematic measurement of the influence on two kinds of commodity CMP machines, using a recently released CMP benchmark suite, PARSEC, with a number of potentially important factors on program, OS, and architecture levels considered. The measurement shows some surprising results. Contrary to the commonly perceived importance of cache sharing, neither positive nor negative effects from the cache sharing are significant for most of the program executions, regardless of the types of parallelism, input datasets, architectures, numbers of threads, and assignments of threads to cores. After a detailed analysis, we find that the main reason is the mismatch of current development and compilation of multithreaded applications and CMP architectures.
By transforming the programs in a cache-sharing-aware manner, we observe up to 36\% performance increase when the threads are placed on cores appropriately.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "chip multiprocessors; parallel program optimizations; shared cache; thread scheduling", } @Article{Liu:2010:IPL, author = "Lixia Liu and Zhiyuan Li", title = "Improving parallelism and locality with asynchronous algorithms", journal = j-SIGPLAN, volume = "45", number = "5", pages = "213--222", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693483", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As multicore chips become the main building blocks for high performance computers, many numerical applications face a performance impediment due to the limited hardware capacity to move data between the CPU and the off-chip memory. This is especially true for large computing problems solved by iterative algorithms because of the large data set typically used. Loop tiling, also known as loop blocking, was shown previously to be an effective way to enhance data locality, and hence to reduce the memory bandwidth pressure, for a class of iterative algorithms executed on a single processor. Unfortunately, the tiled programs suffer from reduced parallelism because only the loop iterations within a single tile can be easily parallelized. In this work, we propose to use the asynchronous model to enable effective loop tiling such that both parallelism and locality can be attained simultaneously. Asynchronous algorithms were previously proposed to reduce the communication cost and synchronization overhead between processors. Our new discovery is that carefully controlled asynchrony and loop tiling can significantly improve the performance of parallel iterative algorithms on multicore processors due to simultaneously attained data locality and loop-level parallelism. We present supporting evidence from experiments with three well-known numerical kernels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "asynchronous algorithms; data locality; loop tiling; memory performance; parallel numerical programs", } @Article{Castaldo:2010:SLP, author = "Anthony M. Castaldo and R. Clint Whaley", title = "Scaling {LAPACK} panel operations using parallel cache assignment", journal = j-SIGPLAN, volume = "45", number = "5", pages = "223--232", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693484", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In LAPACK many matrix operations are cast as block algorithms which iteratively process a panel using an unblocked algorithm and then update a remainder matrix using the high performance Level 3 BLAS. The Level 3 BLAS have excellent weak scaling, but panel processing tends to be bus bound, and thus scales with bus speed rather than the number of processors ({\em p\/}). 
Amdahl's law therefore ensures that as {\em p\/} grows, the panel computation will become the dominant cost of these LAPACK routines. Our contribution is a novel parallel cache assignment approach which we show scales well with {\em p}. We apply this general approach to the QR and LU panel factorizations on two commodity 8-core platforms with very different cache structures, and demonstrate superlinear panel factorization speedups on both machines. Other approaches to this problem demand complicated reformulations of the computational approach, new kernels to be tuned, new mathematics, an inflation of the high-order flop count, and do not perform as well. By demonstrating a straight-forward alternative that avoids all of these contortions and scales with {\em p}, we address a critical stumbling block for dense linear algebra in the age of massive parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "Atlas; factorization; GPU; LAPACK; LU; multicore; multi-core; parallel; QR", } @Article{Sutherland:2010:CTC, author = "Dean F. Sutherland and William L. Scherlis", title = "Composable thread coloring", journal = j-SIGPLAN, volume = "45", number = "5", pages = "233--244", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693485", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces the language-independent concept of ``thread usage policy.'' Many multi-threaded software systems contain policies that regulate associations among threads, executable code, and potentially shared state. A system, for example, may constrain which threads are permitted to execute particular code segments, usually as a means to constrain those threads from accessing or writing particular elements of state. These policies ensure properties such as state confinement or reader/writer constraints, often without recourse to locking or transaction discipline.\par Our approach allows developers to concisely document their thread usage policies in a manner that enables the use of sound scalable analysis to assess consistency of policy and as-written code. This paper identifies the key semantic concepts of our thread coloring language and illustrates how to use its succinct source-level annotations to express models of thread usage policies, following established annotation conventions for Java.\par We have built a prototype static analysis tool, implemented as an integrated development environment plug-in (for the Eclipse IDE), that notifies developers of discrepancies between policy annotations and as-written code. Our analysis technique uses several underlying algorithms based on abstract interpretation, call-graphs, and type inference. The resulting overall analysis is both sound and composable. We have used this prototype analysis tool in case studies to model and analyze more than a million lines of code.\par Our validation process included field trials on a wide variety of complex large-scale production code selected by the host organizations. Our in-field experience led us to focus on potential adoptability by real-world developers. We have developed techniques that can reduce annotation density to less than one line per thousand lines of code (KLOC). 
In addition, the prototype analysis tool supports an incremental and iterative approach to modeling and analysis. This approach enabled field trial partners to directly target areas of greatest concern and to achieve useful results within a few hours.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "annotation; Java; state consistency; multicore; race conditions; state confinement; thread policy", } @Article{Agrawal:2010:HLF, author = "Kunal Agrawal and Charles E. Leiserson and Jim Sukha", title = "Helper locks for fork-join parallel programming", journal = j-SIGPLAN, volume = "45", number = "5", pages = "245--256", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693487", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Helper locks allow programs with large parallel critical sections, called parallel regions, to execute more efficiently by enlisting processors that might otherwise be waiting on the helper lock to aid in the execution of the parallel region. Suppose that a processor {\em p\/} is executing a parallel region {\em A\/} after having acquired the lock {\em L\/} protecting {\em A}. If another processor {\em p\/}$'$ tries to acquire {\em L}, then instead of blocking and waiting for {\em p\/} to complete {\em A}, processor {\em p\/}$'$ joins {\em p\/} to help it complete {\em A}. Additional processors not blocked on {\em L\/} may also help to execute {\em A}.\par The HELPER runtime system can execute fork-join computations augmented with helper locks and parallel regions. HELPER supports the unbounded nesting of parallel regions. We provide theoretical completion-time and space-usage bounds for a design of HELPER based on work stealing. Specifically, let {\em V\/} be the number of parallel regions in a computation, let {\em T\/}$_1$ be its work, and let {\em T\/}$_\infty$ be its `aggregate span' --- the sum of the spans (critical-path lengths) of all its parallel regions. We prove that HELPER completes the computation in expected time {\em O\/}({\em T\/}$_1$/{\em P\/} + {\em T\/}$_\infty$ + {\em PV\/}) on {\em P\/} processors. This bound indicates that programs with a small number of highly parallel critical sections can attain linear speedup. For the space bound, we prove that HELPER completes a program using only $O(P S_1)$ stack space, where $S_1$ is the sum, over all regions, of the stack space used by each region in a serial execution. Finally, we describe a prototype of HELPER implemented by modifying the Cilk multithreaded runtime system. We used this prototype to implement a concurrent hash table with a resize operation protected by a helper lock.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "Cilk; dynamic multithreading; helper lock; nested parallelism; parallel region; scheduling; work stealing", } @Article{Bronson:2010:PCB, author = "Nathan G.
Bronson and Jared Casper and Hassan Chafi and Kunle Olukotun", title = "A practical concurrent binary search tree", journal = j-SIGPLAN, volume = "45", number = "5", pages = "257--268", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693488", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a concurrent relaxed balance AVL tree algorithm that is fast, scales well, and tolerates contention. It is based on optimistic techniques adapted from software transactional memory, but takes advantage of specific knowledge of the algorithm to reduce overheads and avoid unnecessary retries. We extend our algorithm with a fast linearizable clone operation, which can be used for consistent iteration of the tree. Experimental evidence shows that our algorithm outperforms a highly tuned concurrent skip list for many access patterns, with an average of 39\% higher single-threaded throughput and 32\% higher multi-threaded throughput over a range of contention levels and operation mixes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "optimistic concurrency; snapshot isolation", } @Article{Tallent:2010:ALC, author = "Nathan R. Tallent and John M. Mellor-Crummey and Allan Porterfield", title = "Analyzing lock contention in multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "5", pages = "269--280", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693489", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many programs exploit shared-memory parallelism using multithreading. Threaded codes typically use locks to coordinate access to shared data. In many cases, contention for locks reduces parallel efficiency and hurts scalability. Being able to quantify and attribute lock contention is important for understanding where a multithreaded program needs improvement.\par This paper proposes and evaluates three strategies for gaining insight into performance losses due to lock contention. First, we consider using a straightforward strategy based on call stack profiling to attribute idle time and show that it fails to yield insight into lock contention. Second, we consider an approach that builds on a strategy previously used for analyzing idleness in work-stealing computations; we show that this strategy does not yield insight into lock contention. Finally, we propose a new technique for measurement and analysis of lock contention that uses data associated with locks to blame lock holders for the idleness of spinning threads. Our approach incurs $\leq$ 5\% overhead on a quantum chemistry application that makes extensive use of locking (65M distinct locks, a maximum of 340K live locks, and an average of 30K lock acquisitions per second per thread) and attributes lock contention to its full static and dynamic calling contexts. 
Our strategy, implemented in HPCToolkit, is fully distributed and should scale well to systems with large core counts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "HPCToolkit; lock contention; multithreading; performance analysis", } @Article{Upadhyaya:2010:UDS, author = "Gautam Upadhyaya and Samuel P. Midkiff and Vijay S. Pai", title = "Using data structure knowledge for efficient lock generation and strong atomicity", journal = j-SIGPLAN, volume = "45", number = "5", pages = "281--292", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693490", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To achieve high performance on multicore systems, shared-memory parallel languages must efficiently implement atomic operations. The commonly used and studied paradigms for atomicity are fine-grained locking, which is both difficult to program and error-prone; optimistic software transactions, which require substantial overhead to detect and recover from atomicity violations; and compiler-generation of locks from programmer-specified atomic sections, which leads to serialization whenever imprecise pointer analysis suggests the mere possibility of a conflicting operation. This paper presents a new strategy for compiler-generated locking that uses data structure knowledge to facilitate more precise alias and lock generation analyses and reduce unnecessary serialization. Implementing and evaluating these ideas in the Java language shows that the new strategy achieves eight-thread speedups of 0.83 to 5.9 for the five STAMP benchmarks studied, outperforming software transactions on all but one benchmark, and nearly matching programmer-specified fine-grained locks on all but one benchmark. The results also indicate that compiler knowledge of data structures improves the effectiveness of compiler analysis, boosting eight-thread performance by up to 300\%. Further, the new analysis allows for software support of strong atomicity with less than 1\% overhead for two benchmarks and less than 20\% for three others. The strategy also nearly matches the performance of programmer-specified fine-grained locks for the SPECjbb2000 benchmark, which has traditionally not been amenable to static analyses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "automatic lock generation; parallel programming; transactional memory", } @Article{Ali:2010:MAC, author = "Qasim Ali and Samuel Pratt Midkiff and Vijay S. Pai", title = "Modeling advanced collective communication algorithms on {Cell}-based systems", journal = j-SIGPLAN, volume = "45", number = "5", pages = "293--304", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693492", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents and validates performance models for a variety of high-performance collective communication algorithms for systems with Cell processors.
The systems modeled include a single Cell processor, two Cell chips on a Cell Blade, and a cluster of Cell Blades. The models extend PLogP, the well-known point-to-point performance model, by accounting for the unique hardware characteristics of the Cell (e.g., heterogeneous interconnects and DMA engines) and by applying the model to collective communication. This paper also presents a micro-benchmark suite to accurately measure the extended PLogP parameters on the Cell Blade and then uses these parameters to model different algorithms for the {\em barrier, broadcast, reduce, all-reduce}, and {\em all-gather\/} collective operations. Out of 425 total performance predictions, 398 of them see less than 10\% error compared to the actual execution time and all of them see less than 15\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "algorithms; collective communication; modeling", } @Article{Zhai:2010:PPP, author = "Jidong Zhai and Wenguang Chen and Weimin Zheng", title = "{PHANTOM}: predicting performance of parallel applications on large-scale parallel machines using a single node", journal = j-SIGPLAN, volume = "45", number = "5", pages = "305--314", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693493", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For designers of large-scale parallel computers, it is greatly desired that performance of parallel applications can be predicted at the design phase. However, this is difficult because the execution time of parallel applications is determined by several factors, including sequential computation time in each process, communication time and their convolution. Despite previous efforts, it remains an open problem to estimate sequential computation time in each process accurately and efficiently for large-scale parallel applications on non-existing target machines.\par This paper proposes a novel approach to predict the sequential computation time accurately and efficiently. We assume that there is at least one node of the target platform but the whole target system need not be available. We make two main technical contributions. First, we employ deterministic replay techniques to execute any process of a parallel application on a single node at real speed. As a result, we can simply measure the real sequential computation time on a target node for each process one by one. Second, we observe that computation behavior of processes in parallel applications can be clustered into a few groups while processes in each group have similar computation behavior. This observation helps us reduce measurement time significantly because we only need to execute representative parallel processes instead of all of them.\par We have implemented a performance prediction framework, called PHANTOM, which integrates the above computation-time acquisition approach with a trace-driven network simulator. We validate our approach on several platforms. For ASCI Sweep3D, the error of our approach is less than 5\% on 1024 processor cores. 
Compared to a recent regression-based prediction approach, PHANTOM presents better prediction accuracy across different platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "deterministic replay; parallel application; performance prediction; trace-driven simulation", } @Article{Aleen:2010:IDD, author = "Farhana Aleen and Monirul Sharif and Santosh Pande", title = "Input-driven dynamic execution prediction of streaming applications", journal = j-SIGPLAN, volume = "45", number = "5", pages = "315--324", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693494", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Streaming applications are promising targets for effectively utilizing multicores because of their inherent amenability to pipelined parallelism. While existing methods of orchestrating streaming programs on multicores have mostly been static, real-world applications show ample variations in execution time that may cause the achieved speedup and throughput to be sub-optimal. One of the principal challenges for moving towards dynamic orchestration has been the lack of approaches that can predict or accurately estimate upcoming dynamic variations in execution efficiently, well before they occur.\par In this paper, we propose an automated dynamic execution behavior prediction approach that can be used to efficiently estimate the time that will be spent in different pipeline stages for upcoming inputs without requiring program execution. This enables dynamic balancing or scheduling of execution to achieve better speedup. Our approach first uses dynamic taint analysis to automatically generate an input-based execution characterization of the streaming program, which identifies the key control points where variation in execution might occur with the associated input elements that cause these variations. We then automatically generate a light-weight emulator from the program using this characterization that can simulate the execution paths taken for new streaming inputs and provide an estimate of execution time that will be spent in processing these inputs, enabling prediction of possible dynamic variations. We present experimental evidence that our technique can accurately and efficiently estimate execution behaviors for several benchmarks.
Our experiments show that dynamic orchestration using our predicted execution behavior can achieve considerably higher speedup than static orchestration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic execution; parallelization; software pipeline", } @Article{Lupei:2010:TST, author = "Daniel Lupei and Bogdan Simion and Don Pinto and Matthew Misler and Mihai Burcea and William Krick and Cristiana Amza", title = "Towards scalable and transparent parallelization of multiplayer games using transactional memory support", journal = j-SIGPLAN, volume = "45", number = "5", pages = "325--326", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693496", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This work addresses the problem of parallelizing multiplayer games using {\em software\/} Transactional Memory (STM) support. Using a realistic high impact application, we show that STM provides not only ease of programming, but also {\em better\/} performance than that achievable with state-of-the-art lock-based programming.\par Towards this goal, we use SynQuake, a game benchmark which extracts the main data structures and the essential features of the popular multiplayer game Quake, but can be driven with a synthetic workload generator that flexibly emulates client game actions and various hot-spot scenarios in the game world.\par We implement, evaluate and compare the STM version of SynQuake with a state-of-the-art lock-based parallelization of Quake, which we ported to SynQuake. While in STM-SynQuake support for maintaining the consistency of each potentially complex game action is automatic, conservative locking of surrounding objects within a bounding box for the duration of the game action is inherently needed in lock-based SynQuake. This leads to a higher scalability factor of STM-SynQuake versus lock-based SynQuake, due to a higher degree of false sharing in the latter.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "massively multiplayer games; scalability; software transactional memory; synchronization", } @Article{Perarnau:2010:KRC, author = "Swann Perarnau and Guillaume Huard", title = "{KRASH}: reproducible {CPU} load generation on many cores machines", journal = j-SIGPLAN, volume = "45", number = "5", pages = "327--328", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693497", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this article we present KRASH, a tool for reproducible generation of system-level CPU load. This tool is intended for use in shared memory machines equipped with multiple CPU cores which are usually exploited concurrently by several users. The objective of KRASH is to enable parallel application developers to validate their resources use strategies on a partially loaded machine by {\em replaying\/} an observed load in concurrence with their application. 
To reach this objective, we present a method for CPU load generation which behaves as realistically as possible: the resulting load is similar to the load that would be produced by concurrent processes run by other users. Nevertheless, contrary to a simple run of a CPU-intensive application, KRASH is not sensitive to system scheduling decisions. The main benefit brought by KRASH is this reproducibility: no matter how many processes are present in the system the load generated by our tool strictly respects a given load profile. To our knowledge, KRASH is the only tool that implements the generation of a dynamic load profile (a load varying with time). When used to generate a constant load, KRASH result is among the most realistic ones. Furthermore, KRASH provides more flexibility than other tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "CPU load generation; experimentation testbed; many cores", } @Article{Muralidhara:2010:IAS, author = "Sai Prashanth Muralidhara and Mahmut Kandemir and Padma Raghavan", title = "Intra-application shared cache partitioning for multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "5", pages = "329--330", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693498", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we address the problem of partitioning a shared cache when the executing threads belong to the same application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "cache; multicore; parallel applications", } @Article{Dash:2010:SPT, author = "Alokika Dash and Brian Demsky", title = "Symbolic prefetching in transactional distributed shared memory", journal = j-SIGPLAN, volume = "45", number = "5", pages = "331--332", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693499", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a static analysis for the automatic generation of symbolic prefetches in a transactional distributed shared memory. A symbolic prefetch specifies the first object to be prefetched followed by a list of field offsets or array indices that define a path through the heap. We also provide an object caching framework and language extensions to support our approach. To our knowledge, this is the first prefetching approach that can prefetch objects whose addresses have not been computed or predicted.\par Our approach makes aggressive use of both prefetching and caching of remote objects to hide network latency. It relies on the transaction commit mechanism to preserve the simple transactional consistency model that we present to the developer. We have evaluated this approach on several shared memory parallel benchmarks and a distributed gaming benchmark to observe speedups due to prefetching and caching. 
", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "distributed shared memory; symbolic prefetching; transactional memory", } @Article{Chakrabarti:2010:NAE, author = "Dhruva R. Chakrabarti", title = "New abstractions for effective performance analysis of {STM} programs", journal = j-SIGPLAN, volume = "45", number = "5", pages = "333--334", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693500", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the design and implementation of a dynamic conflict graph annotated with fine grain transaction characteristics and show that this is important information for effective performance analysis of a software transactional memory (STM) program. We show how to implement the necessary support in a compiler and an STM with minimal perturbation of the original behavior of the application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "concurrency; software transactional memory", } @Article{Zhang:2010:CSP, author = "Chao Zhang and Chen Ding and Xiaoming Gu and Kirk Kelsey and Tongxin Bai and Xiaobing Feng", title = "Continuous speculative program parallelization in software", journal = j-SIGPLAN, volume = "45", number = "5", pages = "335--336", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693501", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper addresses the problem of extracting coarse-grained parallelism from large sequential code. It builds on BOP, a system for software speculative parallelization. BOP lets a user mark possibly parallel regions (PPR) in a program and at run-time speculatively executes PPR instances using Unix processes. This short paper presents a new run-time support called continuous speculation, which fully utilizes available parallelism to tolerate differences in PPR task size and processor speed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "software speculative parallelization", } @Article{Marjanovic:2010:ECC, author = "Vladimir Marjanovic and Jes{\'u}s Labarta and Eduard Ayguad{\'e} and Mateo Valero", title = "Effective communication and computation overlap with hybrid {MPI\slash SMPSs}", journal = j-SIGPLAN, volume = "45", number = "5", pages = "337--338", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693502", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Communication overhead is one of the dominant factors affecting performance in high-performance computing systems. To reduce the negative impact of communication, programmers overlap communication and computation by using asynchronous communication primitives.
This increases code complexity, requiring more development effort and making less readable programs. This paper presents the hybrid use of MPI and SMPSs (SMP superscalar, a task-based shared-memory programming model) that allows the programmer to easily introduce the asynchrony necessary to overlap communication and computation. We demonstrate the hybrid use of MPI/SMPSs with the high-performance LINPACK benchmark (HPL), and compare it to the pure MPI implementation, which uses the look-ahead technique to overlap communication and computation. The hybrid MPI/SMPSs version significantly improves the performance of the pure MPI version, getting close to the asymptotic performance at medium problem sizes and still getting significant benefits at small/large problem sizes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "hybrid MPI/SMPSs; LINPACK; MPI; parallel programming model", } @Article{Cederman:2010:SLF, author = "Daniel Cederman and Philippas Tsigas", title = "Supporting lock-free composition of concurrent data objects", journal = j-SIGPLAN, volume = "45", number = "5", pages = "339--340", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693503", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Lock-free data objects offer several advantages over their blocking counterparts, such as being immune to deadlocks and convoying and, more importantly, being highly concurrent. But they share a common disadvantage in that the operations they provide are difficult to compose into larger atomic operations while still guaranteeing lock-freedom. We present a lock-free methodology for composing highly concurrent linearizable objects together by unifying their linearization points. This makes it possible to relatively easily introduce atomic lock-free move operations to a wide range of concurrent objects. Experimental evaluation has shown that the operations originally supported by the data objects keep their performance behavior under our methodology.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "composition; data structures; lock-free", } @Article{Guo:2010:SSL, author = "Yi Guo and Jisheng Zhao and Vincent Cave and Vivek Sarkar", title = "{SLAW}: a scalable locality-aware adaptive work-stealing scheduler for multi-core systems", journal = j-SIGPLAN, volume = "45", number = "5", pages = "341--342", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693504", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This poster introduces SLAW, a Scalable Locality-aware Adaptive Work-stealing scheduler. The SLAW features an adaptive task scheduling algorithm combined with a locality-aware scheduling framework.\par Past work has demonstrated the pros and cons of using fixed scheduling policies, such as {\em work-first\/} and {\em help-first}, in different cases without a clear winner. Prior work also assumes the availability and successful execution of a serial version of the parallel program. 
This assumption can limit the expressiveness of dynamic task parallel languages.\par The SLAW scheduler supports both work-first and help-first policies simultaneously. It does so by using an {\em adaptive\/} approach that selects a scheduling policy on a per-task basis at runtime. The SLAW scheduler also establishes bounds on the stack usage and the heap space needed to store tasks. The experimental results for the benchmarks studied show that SLAW's adaptive scheduler achieves 0.98x - 9.2x speedup over the help-first scheduler and 0.97x - 4.5x speedup over the work-first scheduler for 64-thread executions, thereby establishing the robustness of using an adaptive approach instead of a fixed policy. In contrast, the help-first policy is 9.2x slower than work-first in the worst case for a fixed help-first policy, and the work-first policy is 3.7x slower than help-first in the worst case for a fixed work-first policy. Further, for large irregular recursive parallel computations, the adaptive scheduler runs with bounded stack usage and achieves performance (and supports data sizes) that cannot be delivered by the use of any single fixed policy.\par The SLAW scheduler is designed for programming models where locality hints are provided to the runtime by the programmer or compiler, and achieves {\em locality-awareness\/} by grouping workers into {\em places}. Locality awareness can lead to improved performance by increasing temporal data reuse within a worker and among workers in the same place. Our experimental results show that locality-aware scheduling can achieve up to 2.6x speedup over locality-oblivious scheduling, for the benchmarks studied.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "help-first; work-first; work-stealing", } @Article{Yang:2010:OCG, author = "Yi Yang and Ping Xiang and Jingfei Kong and Huiyang Zhou", title = "An optimizing compiler for {GPGPU} programs with input-data sharing", journal = j-SIGPLAN, volume = "45", number = "5", pages = "343--344", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693505", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Developing high performance GPGPU programs is challenging for application developers since the performance is dependent upon how well the code leverages the hardware features of specific graphics processors. To solve this problem and relieve application developers of low-level hardware-specific optimizations, we introduce a novel compiler to optimize GPGPU programs. Our compiler takes a naive GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler then analyzes the code, identifies memory access patterns, and generates optimized code. The proposed compiler optimizations target at one category of scientific and media processing algorithms, which has the characteristics of input-data sharing when computing neighboring output pixels/elements. Many commonly used algorithms, such as matrix multiplication, convolution, etc., share such characteristics. For these algorithms, novel approaches are proposed to enforce memory coalescing and achieve effective data reuse. 
Data prefetching and hardware-specific tuning are also performed automatically with our compiler framework. The experimental results based on a set of applications show that our compiler achieves very high performance, either superior or very close to the highly fine-tuned library, NVIDIA CUBLAS 2.1.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compiler; GPGPU", } @Article{Chandramowlishwaran:2010:ACC, author = "Aparna Chandramowlishwaran and Kathleen Knobe and Richard Vuduc", title = "Applying the concurrent collections programming model to asynchronous parallel dense linear algebra", journal = j-SIGPLAN, volume = "45", number = "5", pages = "345--346", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693506", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This poster is a case study on the application of a novel programming model, called Concurrent Collections (CnC), to the implementation of an asynchronous-parallel algorithm for computing the Cholesky factorization of dense matrices. In CnC, the programmer expresses her computation in terms of application-specific operations, partially-ordered by semantic scheduling constraints. We demonstrate the performance potential of CnC in this poster, by showing that our Cholesky implementation nearly matches or exceeds competing vendor-tuned codes and alternative programming models. We conclude that the CnC model is well-suited for expressing asynchronous-parallel algorithms on emerging multicore systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "asynchronous algorithms; concurrent collections; dense linear algebra", } @Article{Hoffmann:2010:AHS, author = "Henry Hoffmann and Jonathan Eastep and Marco D. Santambrogio and Jason E. Miller and Anant Agarwal", title = "Application heartbeats for software performance and health", journal = j-SIGPLAN, volume = "45", number = "5", pages = "347--348", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693507", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Adaptive, or self-aware, computing has been proposed to help application programmers confront the growing complexity of multicore software development. However, existing approaches to adaptive systems are largely ad hoc and often do not manage to incorporate the true performance goals of the applications they are designed to support. This paper presents an enabling technology for adaptive computing systems: Application Heartbeats. The Application Heartbeats framework provides a simple, standard programming interface that applications can use to indicate their performance and system software (and hardware) can use to query an application's performance. 
The PARSEC benchmark suite is instrumented with Application Heartbeats to show the broad applicability of the interface, and an external resource scheduler demonstrates the use of the interface by assigning cores to an application to maintain a designated performance goal.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "adaptive algorithms", } @Article{Porter:2010:MTM, author = "Donald E. Porter and Emmett Witchel", title = "Modeling transactional memory workload performance", journal = j-SIGPLAN, volume = "45", number = "5", pages = "349--350", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693508", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactional memory promises to make parallel programming easier than with fine-grained locking, while performing just as well. This performance claim is not always borne out because an application may violate a common-case assumption of the TM designer or because of external system effects. In order to help programmers assess the suitability of their code for transactional memory, this work introduces a formal model of transactional memory as well as a tool, called Syncchar. Syncchar can predict the speedup of a conversion from locks to transactions within 25\% for the STAMP benchmarks. Because getting good performance from transactions is more difficult than commonly appreciated, developers need tools to tune transactional performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "performance; Syncchar; transactional memory", } @Article{Carter:2010:PLN, author = "John D. Carter and William B. Gardner and Gary Grewal", title = "The {Pilot} library for novice {MPI} programmers", journal = j-SIGPLAN, volume = "45", number = "5", pages = "351--352", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693509", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Pilot library is a new method for programming MPI-enabled clusters in C, targeted at novice parallel programmers. Formal elements from Communicating Sequential Processes (CSP) are used to realize a process/channel model of parallel computation that reduces opportunities for deadlock and other communication errors. This simple model, plus an application programming interface (API) styled after C's formatted I/O, are designed to make the library easy to learn.
The Pilot library exists as a thin layer on top of any standard Message Passing Interface (MPI) implementation, preserving MPI's portability and efficiency, with little performance overhead arising as result of Pilot's additional features.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "C; cluster programming; collective operations; deadlock detection; high-performance computing; MPI", } @Article{Jang:2010:DTE, author = "Byunghyun Jang and Perhaad Mistry and Dana Schaa and Rodrigo Dominguez and David Kaeli", title = "Data transformations enabling loop vectorization on multithreaded data parallel architectures", journal = j-SIGPLAN, volume = "45", number = "5", pages = "353--354", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Loop vectorization, a key feature exploited to obtain high performance on Single Instruction Multiple Data (SIMD) vector architectures, is significantly hindered by irregular memory access patterns in the data stream. This paper describes data transformations that allow us to vectorize loops targeting massively multithreaded data parallel architectures. We present a mathematical model that captures loop-based memory access patterns and computes the most appropriate data transformations in order to enable vectorization. Our experimental results show that the proposed data transformations can significantly increase the number of loops that can be vectorized and enhance the data-level parallelism of applications. Our results also show that the overhead associated with our data transformations can be easily amortized as the size of the input data set increases. For the set of high performance benchmark kernels studied, we achieve consistent and significant performance improvements (up to 11.4X) by applying vectorization using our data transformation approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "data transformation; GPGPU; loop vectorization", } @Article{Buehrer:2010:DPS, author = "Gregory Buehrer and Srinivasan Parthasarathy and Shirish Tatikonda", title = "A distributed placement service for graph-structured and tree-structured data", journal = j-SIGPLAN, volume = "45", number = "5", pages = "355--356", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693511", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Effective data placement strategies can enhance the performance of data-intensive applications implemented on high end computing clusters. Such strategies can have a significant impact in localizing the computation, in minimizing synchronization (communication) costs, in enhancing reliability (via strategic replication policies), and in ensuring a balanced workload or enhancing the available bandwidth from massive storage devices (e.g. disk arrays).\par Existing work has largely targeted the placement of relatively simple data types or entities (e.g. elements, vectors, sets, and arrays). 
Here we investigate several hash-based distributed data placement methods targeting tree- and graph-structured data, and develop a locality enhancing placement service for large cluster systems. Target applications include the placement of a single large graph (e.g. Web graph), a single large tree (e.g. large XML file), a forest of graphs or trees (e.g. XML database) and other specialized graph data types - bi-partite (query-click graphs), directed acyclic graphs etc. We empirically evaluate our service by demonstrating its use in improving mining executions for pattern discovery, nearest neighbor searching, graph computations, and applications that combine link and content analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "data placement; distributed computing; structured data", } @Article{Li:2010:SVC, author = "Guodong Li and Ganesh Gopalakrishnan and Robert M. Kirby and Dan Quinlan", title = "A symbolic verifier for {CUDA} programs", journal = j-SIGPLAN, volume = "45", number = "5", pages = "357--358", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693512", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a preliminary automated verifier based on mechanical decision procedures which is able to prove functional correctness of CUDA programs and guarantee to detect bugs such as race conditions. We also employ a symbolic partial order reduction (POR) technique to mitigate the interleaving explosion problem.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "cuda; formal verification; SPMD; symbolic analysis", } @Article{Richards:2010:ADB, author = "Gregor Richards and Sylvain Lebresne and Brian Burg and Jan Vitek", title = "An analysis of the dynamic behavior of {JavaScript} programs", journal = j-SIGPLAN, volume = "45", number = "6", pages = "1--12", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806598", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The JavaScript programming language is widely used for web programming and, increasingly, for general purpose computing. As such, improving the correctness, security and performance of JavaScript applications has been the driving force for research in type systems, static analysis and compiler techniques for this language. Many of these techniques aim to rein in some of the most dynamic features of the language, yet little seems to be known about how programmers actually utilize the language or these features. In this paper we perform an empirical study of the dynamic behavior of a corpus of widely-used JavaScript programs, and analyze how and why the dynamic features are used.
We report on the degree of dynamism that is exhibited by these JavaScript programs and compare that with assumptions commonly made in the literature and accepted industry benchmark suites.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic behavior; dynamic metrics; execution tracing; javascript; program analysis", } @Article{Bond:2010:BEC, author = "Michael D. Bond and Graham Z. Baker and Samuel Z. Guyer", title = "{Breadcrumbs}: efficient context sensitivity for dynamic bug detection analyses", journal = j-SIGPLAN, volume = "45", number = "6", pages = "13--24", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806599", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Calling context--the set of active methods on the stack--is critical for understanding the dynamic behavior of large programs. Dynamic program analysis tools, however, are almost exclusively context insensitive because of the prohibitive cost of representing calling contexts at run time. Deployable dynamic analyses, in particular, have been limited to reporting only static program locations.\par This paper presents Breadcrumbs, an efficient technique for recording and reporting dynamic calling contexts. It builds on an existing technique for computing a compact (one word) encoding of each calling context that client analyses can use in place of a program location. The key feature of our system is a search algorithm that can reconstruct a calling context from its encoding using only a static call graph and a small amount of dynamic information collected at cold (infrequently executed) callsites. Breadcrumbs requires no offline training or program modifications, and handles all language features, including dynamic class loading.\par We use Breadcrumbs to add context sensitivity to two dynamic analyses: a data-race detector and an analysis for diagnosing null pointer exceptions. On average, it adds 10\% to 20\% runtime overhead, depending on a tunable parameter that controls how much dynamic information is collected. Collecting less information lowers the overhead, but can result in a search space explosion. In some cases this causes reconstruction to fail, but in most cases Breadcrumbs produces non-trivial calling contexts that have the potential to significantly improve both the precision of the analyses and the quality of the bug reports.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "bug detection; context sensitivity; dynamic analysis", } @Article{Ruwase:2010:DLE, author = "Olatunji Ruwase and Shimin Chen and Phillip B. Gibbons and Todd C. Mowry", title = "Decoupled lifeguards: enabling path optimizations for dynamic correctness checking tools", journal = j-SIGPLAN, volume = "45", number = "6", pages = "25--35", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806600", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic correctness checking tools (a.k.a.
lifeguards) can detect a wide array of correctness issues, such as memory, security, and concurrency misbehavior, in unmodified executables at run time. However, lifeguards that are implemented using dynamic binary instrumentation (DBI) often slow down the monitored application by 10-50X, while proposals that replace DBI with hardware still see 3-8X slowdowns. The remaining overhead is the cost of performing the lifeguard analysis itself. In this paper, we explore compiler optimization techniques to reduce this overhead.\par The lifeguard software is typically structured as a set of event-driven handlers, where the events are individual instructions in the monitored application's dynamic instruction stream. We propose to {\em decouple\/} the lifeguard checking code from the application that it is monitoring so that the lifeguard analysis can be invoked at the granularity of {\em hot paths\/} in the monitored application. In this way, we are able to find many more opportunities for eliminating redundant work in the lifeguard analysis, even starting with well-optimized applications and hand-tuned lifeguard handlers. Experimental results with two lifeguard frameworks - one DBI-based and one hardware-assisted - show significant reduction in monitoring overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic code optimization; dynamic correctness checking; dynamic program analysis", } @Article{Lee:2010:JSD, author = "Byeongcheol Lee and Ben Wiedermann and Martin Hirzel and Robert Grimm and Kathryn S. McKinley", title = "{Jinn}: synthesizing dynamic bug detectors for foreign language interfaces", journal = j-SIGPLAN, volume = "45", number = "6", pages = "36--49", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806601", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming language specifications mandate static and dynamic analyses to preclude syntactic and semantic errors. Although individual languages are usually well-specified, composing languages is not, and this poor specification is a source of many errors in {\em multilingual\/} programs. For example, virtually all Java programs compose Java and C using the Java Native Interface (JNI). Since JNI is informally specified, developers have difficulty using it correctly, and current Java compilers and virtual machines (VMs) inconsistently check only a subset of JNI constraints.\par This paper's most significant contribution is to show how to synthesize dynamic analyses from state machines to detect foreign function interface (FFI) violations. We identify three classes of FFI constraints encoded by eleven state machines that capture thousands of JNI and Python/C FFI rules. We use a mapping function to specify which state machines, transitions, and program entities (threads, objects, references) to check at each FFI call and return. From this function, we synthesize a context-specific dynamic analysis to find FFI bugs. We build bug detection tools for JNI and Python/C using this approach. For JNI, we dynamically and transparently interpose the analysis on Java and C language transitions through the JVM tools interface. The resulting tool, called Jinn, is compiler and virtual machine {\em independent}. 
It detects and diagnoses a wide variety of FFI bugs that other tools miss. This approach greatly reduces the annotation burden by exploiting common FFI constraints: whereas the generated Jinn code is 22,000+ lines, we wrote only 1,400 lines of state machine and mapping code. Overall, this paper lays the foundation for a more principled approach to developing correct multilingual software and a more concise and automated approach to FFI specification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic analysis; ffi bugs; foreign function interfaces (FFI); java native interface (jni); multilingual programs; python/C; specification; specification generation", } @Article{Prabhu:2010:SPS, author = "Prakash Prabhu and Ganesan Ramalingam and Kapil Vaswani", title = "Safe programmable speculative parallelism", journal = j-SIGPLAN, volume = "45", number = "6", pages = "50--61", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806603", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Execution order constraints imposed by dependences can serialize computation, preventing parallelization of code and algorithms. Speculating on the value(s) carried by dependences is one way to break such critical dependences. Value speculation has been used effectively at a low level, by compilers and hardware. In this paper, we focus on the use of speculation {\em by programmers\/} as an algorithmic paradigm to parallelize seemingly sequential code.\par We propose two new language constructs, {\em speculative composition\/} and {\em speculative iteration}. These constructs enable programmers to declaratively express speculative parallelism in programs: to indicate when and how to speculate, increasing the parallelism in the program, without concerning themselves with mundane implementation details.\par We present a core language with speculation constructs and mutable state and present a formal operational semantics for the language. We use the semantics to define the notion of a correct speculative execution as one that is equivalent to a non-speculative execution. In general, speculation requires a runtime mechanism to undo the effects of speculative computation in the case of mispredictions. We describe a set of conditions under which such rollback can be avoided. We present a static analysis that checks if a given program satisfies these conditions. This allows us to implement speculation efficiently, without the overhead required for rollbacks.\par We have implemented the speculation constructs as a C\# library, along with the static checker for safety.
We present an empirical evaluation of the efficacy of this approach to parallelization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "purity; rollback freedom; safety; speculative parallelism; value speculation", } @Article{Tian:2010:SSP, author = "Chen Tian and Min Feng and Rajiv Gupta", title = "Supporting speculative parallelization in the presence of dynamic data structures", journal = j-SIGPLAN, volume = "45", number = "6", pages = "62--73", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806604", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The availability of multicore processors has led to significant interest in compiler techniques for speculative parallelization of sequential programs. Isolation of speculative state from non-speculative state forms the basis of such speculative techniques as this separation enables recovery from misspeculations. In our prior work on CorD [35,36] we showed that for array and scalar variable based programs copying of data between speculative and non-speculative memory can be highly optimized to support state separation that yields significant speedups on multicore machines available today. However, we observe that in context of heap-intensive programs that operate on linked dynamic data structures, state separation based speculative parallelization poses many challenges. The copying of data structures from non-speculative to speculative state (copy-in operation) can be very expensive due to the large sizes of dynamic data structures. The copying of updated data structures from speculative state to non-speculative state (copy-out operation) is made complex due to the changes in the shape and size of the dynamic data structure made by the speculative computation. In addition, we must contend with the need to translate pointers internal to dynamic data structures between their non-speculative and speculative memory addresses. In this paper we develop an augmented design for the representation of dynamic data structures such that all of the above operations can be performed efficiently. Our experiments demonstrate significant speedups on a real machine for a set of programs that make extensive use of heap based dynamic data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "multicore processors; speculative parallelization", } @Article{Kandemir:2010:CTA, author = "Mahmut Kandemir and Taylan Yemliha and SaiPrashanth Muralidhara and Shekhar Srikantaiah and Mary Jane Irwin and Yuanrui Zhnag", title = "Cache topology aware computation mapping for multicores", journal = j-SIGPLAN, volume = "45", number = "6", pages = "74--85", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806605", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The main contribution of this paper is a compiler based, cache topology aware code optimization scheme for emerging multicore systems. 
This scheme distributes the iterations of a loop to be executed in parallel across the cores of a target multicore machine and schedules the iterations assigned to each core. Our goal is to improve the utilization of the on-chip multi-layer cache hierarchy and to maximize overall application performance. We evaluate our cache topology aware approach using a set of twelve applications and three different commercial multicore machines. In addition, to study some of our experimental parameters in detail and to explore future multicore machines (with higher core counts and deeper on-chip cache hierarchies), we also conduct a simulation based study. The results collected from our experiments with three Intel multicore machines show that the proposed compiler-based approach is very effective in enhancing performance. In addition, our simulation results indicate that optimizing for the on-chip cache hierarchy will be even more important in future multicores with increasing numbers of cores and cache levels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "cache; compiler; multicore; multi-level; topology-aware", } @Article{Yang:2010:GCM, author = "Yi Yang and Ping Xiang and Jingfei Kong and Huiyang Zhou", title = "A {GPGPU} compiler for memory optimization and parallelism management", journal = j-SIGPLAN, volume = "45", number = "6", pages = "86--97", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806606", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a novel optimizing compiler for general purpose computation on graphics processing units (GPGPU). It addresses two major challenges of developing high performance GPGPU programs: effective utilization of GPU memory hierarchy and judicious management of parallelism.\par The input to our compiler is a na{\"\i}ve GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler analyzes the code, identifies its memory access patterns, and generates both the optimized kernel and the kernel invocation parameters. Our optimization process includes vectorization and memory coalescing for memory bandwidth enhancement, tiling and unrolling for data reuse and parallelism management, and thread block remapping or address-offset insertion for partition-camping elimination. The experiments on a set of scientific and media processing algorithms show that our optimized code achieves very high performance, either superior or very close to the highly fine-tuned library, NVIDIA CUBLAS 2.2, and up to 128 times speedups over the naive versions.
Another distinguishing feature of our compiler is the understandability of the optimized code, which is useful for performance analysis and algorithm refinement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compiler; gpgpu", } @Article{Eggers:2010:AL, author = "Susan Eggers", title = "{2010 Athena} lecture", journal = j-SIGPLAN, volume = "45", number = "6", pages = "98--98", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806608", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Susan Eggers, a Professor of Computer Science and Engineering at the University of Washington, joined her department in 1989. She received a B.A. in 1965 from Connecticut College and a Ph. D. in 1989 from the University of California, Berkeley. Her research interests are in computer architecture and back-end compiler optimization, with an emphasis on experimental performance analysis. With her colleague Hank Levy and their students, she developed the first commercially viable multithreaded architecture, Simultaneous Multithreading, adopted by Intel (as Hyperthreading), IBM, Sun and others. Her current research is in the areas of distributed dataflow machines, FPGAs and chip multiprocessors. In 1989 Professor Eggers was awarded an IBM Faculty Development Award, in 1990 an NSF Presidential Young Investigator Award, in 1994 the Microsoft Professorship in Computer Science and Engineering, and in 2009 the ACM-W Athena Lecturer. She is a Fellow of the ACM and IEEE, a Fellow of the AAAS, and a member of the National Academy of Engineering.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "invited talk", } @Article{Yang:2010:SLI, author = "Jean Yang and Chris Hawblitzel", title = "Safe to the last instruction: automated verification of a type-safe operating system", journal = j-SIGPLAN, volume = "45", number = "6", pages = "99--110", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806610", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Typed assembly language (TAL) and Hoare logic can verify the absence of many kinds of errors in low-level code. We use TAL and Hoare logic to achieve highly automated, static verification of the safety of a new operating system called Verve. Our techniques and tools mechanically verify the safety of every assembly language instruction in the operating system, run-time system, drivers, and applications (in fact, every part of the system software except the boot loader). Verve consists of a 'Nucleus' that provides primitive access to hardware and memory, a kernel that builds services on top of the Nucleus, and applications that run on top of the kernel. The Nucleus, written in verified assembly language, implements allocation, garbage collection, multiple stacks, interrupt handling, and device access. The kernel, written in C\# and compiled to TAL, builds higher-level services, such as preemptive threads, on top of the Nucleus. A TAL checker verifies the safety of the kernel and applications. 
A Hoare-style verifier with an automated theorem prover verifies both the safety and correctness of the Nucleus. Verve is, to the best of our knowledge, the first operating system mechanically verified to guarantee both type and memory safety. More generally, Verve's approach demonstrates a practical way to mix high-level typed code with low-level untyped code in a verifiably safe manner.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "operating system; run-time system; type safety; verification", } @Article{Tatlock:2010:BEV, author = "Zachary Tatlock and Sorin Lerner", title = "Bringing extensibility to verified compilers", journal = j-SIGPLAN, volume = "45", number = "6", pages = "111--121", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806611", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Verified compilers, such as Leroy's CompCert, are accompanied by a fully checked correctness proof. Both the compiler and proof are often constructed with an interactive proof assistant. This technique provides a strong, end-to-end correctness guarantee on top of a small trusted computing base. Unfortunately, these compilers are also challenging to extend since each additional transformation must be proven correct in full formal detail.\par At the other end of the spectrum, techniques for compiler correctness based on a domain-specific language for writing optimizations, such as Lerner's Rhodium and Cobalt, make the compiler easy to extend: the correctness of additional transformations can be checked completely automatically. Unfortunately, these systems provide a weaker guarantee since their end-to-end correctness has not been proven fully formally.\par We present an approach for compiler correctness that provides the best of both worlds by bridging the gap between compiler verification and compiler extensibility. In particular, we have extended Leroy's CompCert compiler with an execution engine for optimizations written in a domain-specific language, and proved that this execution engine preserves program semantics, using the Coq proof assistant. We present our CompCert extension, XCert, including the details of its execution engine and proof of correctness in Coq. Furthermore, we report on the important lessons learned for making the proof development manageable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compiler optimization; correctness; extensibility", } @Article{Chlipala:2010:UST, author = "Adam Chlipala", title = "{Ur}: statically-typed metaprogramming with type-level record computation", journal = j-SIGPLAN, volume = "45", number = "6", pages = "122--133", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806612", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "{\em Dependent types\/} provide a strong foundation for specifying and verifying rich properties of programs through type-checking.
The earliest implementations combined dependency, which allows types to mention program variables; with type-level computation, which facilitates expressive specifications that compute with recursive functions over types. While many recent applications of dependent types omit the latter facility, we argue in this paper that it deserves more attention, even when implemented without dependency.\par In particular, the ability to use functional programs as specifications enables {\em statically-typed metaprogramming\/}: programs write programs, and static type-checking guarantees that the generating process never produces invalid code. Since our focus is on generic validity properties rather than full correctness verification, it is possible to engineer type inference systems that are very effective in narrow domains. As a demonstration, we present Ur, a programming language designed to facilitate metaprogramming with first-class records and names. On top of Ur, we implement Ur/Web, a special standard library that enables the development of modern Web applications. Ad-hoc code generation is already in wide use in the popular Web application frameworks, and we show how that generation may be tamed using types, without forcing metaprogram authors to write proofs or forcing metaprogram users to write any fancy types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dependent types; metaprogramming", } @Article{Emmi:2010:PVT, author = "Michael Emmi and Rupak Majumdar and Roman Manevich", title = "Parameterized verification of transactional memories", journal = j-SIGPLAN, volume = "45", number = "6", pages = "134--145", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806613", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe an automatic verification method to check whether transactional memories ensure strict serializability, a key property assumed of the transactional interface. Our main contribution is a technique for effectively verifying parameterized systems. The technique merges ideas from parameterized hardware and protocol verification--verification by invisible invariants and symmetry reduction--with ideas from software verification--template-based invariant generation and satisfiability checking for quantified formul{\ae} (modulo theories). The combination enables us to precisely model and analyze unbounded systems while taming state explosion.\par Our technique enables automated proofs that two-phase locking (TPL), dynamic software transactional memory (DSTM), and transactional locking II (TL2) systems ensure strict serializability. The verification is challenging since the systems are unbounded in several dimensions: the number and length of concurrently executing transactions, and the size of the shared memory they access, have no finite limit.
In contrast, state-of-the-art software model checking tools such as BLAST and TVLA are unable to validate either system, due to inherent expressiveness limitations or state explosion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "parameterized verification; transactional memory", } @Article{Pizlo:2010:SFT, author = "Filip Pizlo and Lukasz Ziarek and Petr Maj and Antony L. Hosking and Ethan Blanton and Jan Vitek", title = "{SCHISM}: fragmentation-tolerant real-time garbage collection", journal = j-SIGPLAN, volume = "45", number = "6", pages = "146--159", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806615", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Managed languages such as Java and C\# are being considered for use in hard real-time systems. A hurdle to their widespread adoption is the lack of garbage collection algorithms that offer predictable space-and-time performance in the face of fragmentation. We introduce SCHISM/CMR, a new concurrent and real-time garbage collector that is fragmentation tolerant and guarantees time-and-space worst-case bounds while providing good throughput. SCHISM/CMR combines mark-region collection of fragmented objects and arrays (arraylets) with separate replication-copying collection of immutable arraylet spines, so as to cope with external fragmentation when running in small heaps. We present an implementation of SCHISM/CMR in the Fiji VM, a high-performance Java virtual machine for mission-critical systems, along with a thorough experimental evaluation on a wide variety of architectures, including server-class and embedded systems. The results show that SCHISM/CMR tolerates fragmentation better than previous schemes, with a much more acceptable throughput penalty.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "fragmentation; mark-region; mark-sweep; real-time; replication-copying", } @Article{Xu:2010:DIU, author = "Guoqing Xu and Atanas Rountev", title = "Detecting inefficiently-used containers to avoid bloat", journal = j-SIGPLAN, volume = "45", number = "6", pages = "160--173", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806616", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Runtime bloat degrades significantly the performance and scalability of software systems. An important source of bloat is the inefficient use of containers. It is expensive to create inefficiently-used containers and to invoke their associated methods, as this may ultimately execute large volumes of code, with call stacks dozens deep, and allocate many temporary objects.\par This paper presents practical static and dynamic tools that can find inappropriate use of containers in Java programs. At the core of these tools is a base static analysis that identifies, for each container, the objects that are added to this container and the key statements (i.e., heap loads and stores) that achieve the semantics of common container operations such as {\em ADD\/} and {\em GET}. 
The static tool finds problematic uses of containers by considering the nesting relationships among the loops where these {\em semantics-achieving statements\/} are located, while the dynamic tool can instrument these statements and find inefficiencies by profiling their execution frequencies.\par The high precision of the base analysis is achieved by taking advantage of a context-free language (CFL)-reachability formulation of points-to analysis and by accounting for container-specific properties. It is demand-driven and client-driven, facilitating refinement specific to each queried container object and increasing scalability. The tools built with the help of this analysis can be used both to avoid the creation of container-related performance problems early during development, and to help with diagnosis when problems are observed during tuning. Our experimental results show that the static tool has a low false positive rate and produces more relevant information than its dynamic counterpart. Further case studies suggest that significant optimization opportunities can be found by focusing on statically-identified containers for which high allocation frequency is observed at run time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "cfl reachability; container bloat; points-to analysis", } @Article{Xu:2010:FLU, author = "Guoqing Xu and Nick Mitchell and Matthew Arnold and Atanas Rountev and Edith Schonberg and Gary Sevitsky", title = "Finding low-utility data structures", journal = j-SIGPLAN, volume = "45", number = "6", pages = "174--186", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806617", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many opportunities for easy, big-win, program optimizations are missed by compilers. This is especially true in highly layered Java applications. Often at the heart of these missed optimization opportunities lie computations that, with great expense, produce data values that have little impact on the program's final output. Constructing a new date formatter to format every date, or populating a large set full of expensively constructed structures only to check its size: these involve costs that are out of line with the benefits gained. This disparity between the formation costs and accrued benefits of data structures is at the heart of much runtime bloat.\par We introduce a run-time analysis to discover these {\em low-utility\/} data structures. The analysis employs dynamic thin slicing, which naturally associates costs with value flows rather than raw data flows. It constructs a model of the incremental, hop-to-hop, costs and benefits of each data structure. The analysis then identifies suspicious structures based on imbalances of its incremental costs and benefits. To decrease the memory requirements of slicing, we introduce {\em abstract dynamic thin slicing}, which performs thin slicing over bounded abstract domains. We have modified the IBM J9 commercial JVM to implement this approach.\par We demonstrate two client analyses: one that finds objects that are expensive to construct but are not necessary for the forward execution, and a second that pinpoints ultimately-dead values.
We have successfully applied them to large-scale and long-running Java applications. We show that these analyses are effective at detecting operations that have unbalanced costs and benefits.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "abstract dynamic thin slicing; cost benefit analysis; memory bloat", } @Article{Mytkowicz:2010:EAJ, author = "Todd Mytkowicz and Amer Diwan and Matthias Hauswirth and Peter F. Sweeney", title = "Evaluating the accuracy of {Java} profilers", journal = j-SIGPLAN, volume = "45", number = "6", pages = "187--197", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806618", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Performance analysts profile their programs to find methods that are worth optimizing: the 'hot' methods. This paper shows that four commonly-used Java profilers ({\em xprof, hprof, jprofile, and yourkit\/}) often disagree on the identity of the hot methods. If two profilers disagree, at least one must be incorrect. Thus, there is a good chance that a profiler will mislead a performance analyst into wasting time optimizing a cold method with little or no performance improvement.\par This paper uses causality analysis to evaluate profilers and to gain insight into the source of their incorrectness. It shows that these profilers all violate a fundamental requirement for sampling based profilers: to be correct, a sampling-based profiler must collect samples randomly.\par We show that a proof-of-concept profiler, which collects samples randomly, does not suffer from the above problems. Specifically, we show, using a number of case studies, that our profiler correctly identifies methods that are important to optimize; in some cases other profilers report that these methods are cold and thus not worth optimizing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "bias; observer effect; profiling", } @Article{Baek:2010:GFS, author = "Woongki Baek and Trishul M. Chilimbi", title = "{Green}: a framework for supporting energy-conscious programming using controlled approximation", journal = j-SIGPLAN, volume = "45", number = "6", pages = "198--209", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806620", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy-efficient computing is important in several systems ranging from embedded devices to large scale data centers. Several application domains offer the opportunity to tradeoff quality of service/solution (QoS) for improvements in performance and reduction in energy consumption. Programmers sometimes take advantage of such opportunities, albeit in an ad-hoc manner and often without providing any QoS guarantees.\par We propose a system called Green that provides a simple and flexible framework that allows programmers to take advantage of such approximation opportunities in a systematic manner while providing statistical QoS guarantees. 
Green enables programmers to approximate expensive functions and loops and operates in two phases. In the calibration phase, it builds a model of the QoS loss produced by the approximation. This model is used in the operational phase to make approximation decisions based on the QoS constraints specified by the programmer. The operational phase also includes an adaptation function that occasionally monitors the runtime behavior and changes the approximation decisions and QoS model to provide strong statistical QoS guarantees.\par To evaluate the effectiveness of Green, we implemented our system and language extensions using the Phoenix compiler framework. Our experiments using benchmarks from domains such as graphics, machine learning, signal processing, and finance, and an in-production, real-world web search engine, indicate that Green can produce significant improvements in performance and energy consumption with small and controlled QoS degradation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "controlled approximation; energy-conscious programming", } @Article{Rajan:2010:GPM, author = "Kaushik Rajan and Sriram Rajamani and Shashank Yaduvanshi", title = "{GUESSTIMATE}: a programming model for collaborative distributed systems", journal = j-SIGPLAN, volume = "45", number = "6", pages = "210--220", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806621", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new programming model GUESSTIMATE for developing collaborative distributed systems. The model allows atomic, isolated operations that transform a system from consistent state to consistent state, and provides a shared transactional store for a collection of such operations executed by various machines in a distributed system. In addition to 'committed state' which is identical in all machines in the distributed system, GUESSTIMATE allows each machine to have a replicated local copy of the state (called 'guesstimated state') so that operations on shared state can be executed locally without any blocking, while also guaranteeing that eventually all machines agree on the sequences of operations executed. Thus, each operation is executed multiple times, once at the time of issue when it updates the guesstimated state of the issuing machine, once when the operation is committed (atomically) to the committed state of all machines, and several times in between as the guesstimated state converges toward the committed state. While we expect the results of these executions of the operation to be identical most of the time in the class of applications we study, it is possible for an operation to succeed the first time when it is executed on the guesstimated state, and fail when it is committed. GUESSTIMATE provides facilities that allow the programmer to deal with this potential discrepancy. 
This paper presents our programming model, its operational semantics, its realization as an API in C\#, and our experience building collaborative distributed applications with this model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "collaborative applications; concurrency; distributed systems; language extensions", } @Article{Xi:2010:CFM, author = "Qian Xi and David Walker", title = "A context-free markup language for semi-structured text", journal = j-SIGPLAN, volume = "45", number = "6", pages = "221--232", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806622", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "An {\em ad hoc data format\/} is any nonstandard, semi-structured data format for which robust data processing tools are not easily available. In this paper, we present ANNE, a new kind of markup language designed to help users generate documentation and data processing tools for ad hoc text data. More specifically, given a new ad hoc data source, an ANNE programmer edits the document to add a number of simple annotations, which serve to specify its syntactic structure. Annotations include elements that specify constants, optional data, alternatives, enumerations, sequences, tabular data, and recursive patterns. The ANNE system uses a combination of user annotations and the raw data itself to extract a context-free grammar from the document. This context-free grammar can then be used to parse the data and transform it into an XML parse tree, which may be viewed through a browser for analysis or debugging purposes. In addition, the ANNE system generates a PADS/ML description, which may be saved as lasting documentation of the data format or compiled into a host of useful data processing tools.\par In addition to designing and implementing ANNE, we have devised a semantic theory for the core elements of the language. This semantic theory describes the editing process, which translates a raw, unannotated text document into an annotated document, and the grammar extraction process, which generates a context-free grammar from an annotated document. We also present an alternative characterization of system behavior by drawing upon ideas from the field of relevance logic. This secondary characterization, which we call {\em relevance analysis}, specifies a direct relationship between unannotated documents and the context-free grammars that our system can generate from them. 
Relevance analysis allows us to prove important theorems concerning the expressiveness and utility of our system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "ad hoc data; ANNE; domain-specific languages; PADS; tool generation", } @Article{Loitsch:2010:PFP, author = "Florian Loitsch", title = "Printing floating-point numbers quickly and accurately with integers", journal = j-SIGPLAN, volume = "45", number = "6", pages = "233--243", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806623", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present algorithms for accurately converting floating-point numbers to decimal representation. They are fast (up to 4 times faster than commonly used algorithms that use high-precision integers) and correct: any printed number will evaluate to the same number, when read again.\par Our algorithms are fast, because they require only fixed-size integer arithmetic. The sole requirement for the integer type is that it has at least two more bits than the significand of the floating-point number. Hence, for IEEE 754 double-precision numbers (having a 53-bit significand) an integer type with 55 bits is sufficient. Moreover we show how to exploit additional bits to improve the generated output.\par We present three algorithms with different properties: the first algorithm is the most basic one, and does not take advantage of any extra bits. It simply shows how to perform the binary-to-decimal transformation with the minimal number of bits. Our second algorithm improves on the first one by using the additional bits to produce a shorter (often the shortest) result.\par Finally we propose a third version that can be used when the shortest output is a requirement. The last algorithm either produces optimal decimal representations (with respect to shortness and rounding) or rejects its input. For IEEE 754 double-precision numbers and 64-bit integers roughly 99.4\% of all numbers can be processed efficiently. The remaining 0.6\% are rejected and need to be printed by a slower complete algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dtoa; floating-point printing", } @Article{Flanagan:2010:AMD, author = "Cormac Flanagan and Stephen N. Freund", title = "Adversarial memory for detecting destructive races", journal = j-SIGPLAN, volume = "45", number = "6", pages = "244--254", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806625", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multithreaded programs are notoriously prone to race conditions, a problem exacerbated by the widespread adoption of multi-core processors with complex memory models and cache coherence protocols. Much prior work has focused on static and dynamic analyses for race detection, but these algorithms typically are unable to distinguish destructive races that cause erroneous behavior from benign races that do not. 
Performing this classification manually is difficult, time consuming, and error prone.\par This paper presents a new dynamic analysis technique that uses {\em adversarial memory\/} to classify race conditions as destructive or benign on systems with relaxed memory models. Unlike a typical language implementation, which may only infrequently exhibit non-sequentially consistent behavior, our adversarial memory implementation exploits the full freedom of the memory model to return older, unexpected, or stale values for memory reads whenever possible, in an attempt to crash the target program (that is, to force the program to behave erroneously). A crashing execution provides concrete evidence of a destructive bug, and this bug can be strongly correlated with a specific race condition in the target program.\par Experimental results with our Jumble prototype for Java demonstrate that adversarial memory is highly effective at identifying destructive race conditions, and in distinguishing them from race conditions that are real but benign. Adversarial memory can also reveal destructive races that would not be detected by traditional testing (even after thousands of runs) or by model checkers that assume sequential consistency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "concurrency; dynamic analysis; race conditions; relaxed memory models", } @Article{Bond:2010:PPD, author = "Michael D. Bond and Katherine E. Coons and Kathryn S. McKinley", title = "{PACER}: proportional detection of data races", journal = j-SIGPLAN, volume = "45", number = "6", pages = "255--268", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806626", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data races indicate serious concurrency bugs such as order, atomicity, and sequential consistency violations. Races are difficult to find and fix, often manifesting only after deployment. The frequency and unpredictability of these bugs will only increase as software adds parallelism to exploit multicore hardware. Unfortunately, sound and precise race detectors slow programs by factors of eight or more and do not scale to large numbers of threads.\par This paper presents a precise, low-overhead {\em sampling-based\/} data race detector called Pacer. PACER makes a {\em proportionality\/} guarantee: it detects any race at a rate equal to the sampling rate, by finding races whose first access occurs during a global sampling period. During sampling, PACER tracks all accesses using the dynamically sound and precise FastTrack algorithm. In nonsampling periods, Pacer discards sampled access information that cannot be part of a reported race, {\em and\/} Pacer simplifies tracking of the happens-before relationship, yielding near-constant, instead of linear, overheads. Experimental results confirm our theoretical guarantees. PACER reports races in proportion to the sampling rate. Its time and space overheads scale with the sampling rate, and sampling rates of 1-3\% yield overheads low enough to consider in production software. 
The resulting system provides a 'get what you pay for' approach that is suitable for identifying real, hard-to-reproduce races in deployed systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "bugs; concurrency; data races; sampling", } @Article{Nakaike:2010:LER, author = "Takuya Nakaike and Maged M. Michael", title = "Lock elision for read-only critical sections in {Java}", journal = j-SIGPLAN, volume = "45", number = "6", pages = "269--278", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806627", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is not uncommon in parallel workloads to encounter shared data structures with read-mostly access patterns, where operations that update data are infrequent and most operations are read-only. Typically, data consistency is guaranteed using mutual exclusion or read-write locks. The cost of atomic update of lock variables results in high overheads and high cache coherence traffic under active sharing, thus slowing down single thread performance and limiting scalability.\par In this paper, we present {\em SOLERO (Software Optimistic Lock Elision for Read-Only critical sections)}, a new lock implementation for optimizing read-only critical sections in Java based on sequential locks. SOLERO is compatible with the conventional lock implementation of Java. However, unlike the conventional implementation, only critical sections that may write data or have side effects need to update lock variables, while read-only critical sections need only read lock variables without writing them. Each writing critical section changes the lock value to a new value. Hence, a read-only critical section is guaranteed to be consistent if the lock is free and its value does not change from the beginning to the end of the read-only critical section.\par Using Java workloads including SPECjbb2005 and the HashMap and TreeMap Java classes, we evaluate the performance impact of applying SOLERO to read-mostly locks. Our experimental results show performance improvements across the board, often substantial, in both single thread speed and scalability over the conventional lock implementation (mutual exclusion) and read-write locks. SOLERO improves the performance of SPECjbb2005 by 3-5\% on single and multiple threads.
The results using the HashMap and TreeMap benchmarks show that SOLERO outperforms the conventional lock implementation and read-write locks by substantial multiples on multi-threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "java; just-in-time compiler; lock; lock elision; monitor; optimization; synchronization", } @Article{Chaudhuri:2010:SI, author = "Swarat Chaudhuri and Armando Solar-Lezama", title = "Smooth interpretation", journal = j-SIGPLAN, volume = "45", number = "6", pages = "279--291", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806629", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present {\em smooth interpretation}, a method to systematically approximate numerical imperative programs by smooth mathematical functions. This approximation facilitates the use of numerical search techniques like gradient descent for program analysis and synthesis. The method extends to programs the notion of {\em Gaussian smoothing}, a popular signal-processing technique that filters out noise and discontinuities from a signal by taking its convolution with a Gaussian function.\par In our setting, Gaussian smoothing executes a program according to a probabilistic semantics; the execution of program {\em P\/} on an input {\em x\/} after Gaussian smoothing can be summarized as follows: (1) Apply a Gaussian perturbation to {\em x\/} -- the perturbed input is a random variable following a normal distribution with mean {\em x}. (2) Compute and return the {\em expected output\/} of {\em P\/} on this perturbed input. Computing the expectation explicitly would require the execution of {\em P\/} on all possible inputs, but smooth interpretation bypasses this requirement by using a form of symbolic execution to approximate the effect of Gaussian smoothing on {\em P}. The result is an efficient but approximate implementation of Gaussian smoothing of programs.\par Smooth interpretation has the effect of attenuating features of a program that impede numerical searches of its input space -- for example, discontinuities resulting from conditional branches are replaced by continuous transitions. We apply smooth interpretation to the problem of synthesizing values of numerical control parameters in embedded control applications. This problem is naturally formulated as one of numerical optimization: the goal is to find parameter values that minimize the error between the resulting program and a programmer-provided behavioral specification. Solving this problem by directly applying numerical optimization techniques is often impractical due to the discontinuities in the error function. By eliminating these discontinuities, smooth interpretation makes it possible to search the parameter space efficiently by means of simple gradient descent. 
Our experiments demonstrate the value of this strategy in synthesizing parameters for several challenging programs, including models of an automated gear shift and a PID controller.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "continuity; parameter synthesis; program smoothing", } @Article{Gulwani:2010:RBP, author = "Sumit Gulwani and Florian Zuleger", title = "The reachability-bound problem", journal = j-SIGPLAN, volume = "45", number = "6", pages = "292--304", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806630", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We define the {\em reachability-bound problem\/} to be the problem of finding a symbolic worst-case bound on the number of times a given control location inside a procedure is visited in terms of the inputs to that procedure. This has applications in bounding resources consumed by a program such as time, memory, network-traffic, power, as well as estimating quantitative properties (as opposed to boolean properties) of data in programs, such as information leakage or uncertainty propagation. Our approach to solving the reachability-bound problem brings together two different techniques for reasoning about loops in an effective manner. One of these techniques is an abstract-interpretation based iterative technique for computing precise disjunctive invariants (to summarize nested loops). The other technique is a non-iterative proof-rules based technique (for loop bound computation) that takes over the role of doing inductive reasoning, while deriving its power from the use of SMT solvers to reason about abstract loop-free fragments.\par Our solution to the reachability-bound problem allows us to compute precise symbolic complexity bounds for several loops in {.NET} base-class libraries for which earlier techniques fail. We also illustrate the precision of our algorithm for disjunctive invariant computation (which has a more general applicability beyond the reachability-bound problem) on a set of benchmark examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "disjunctive invariants; pattern matching; ranking functions; resource bound analysis; transitive closure", } @Article{Might:2010:REC, author = "Matthew Might and Yannis Smaragdakis and David {Van Horn}", title = "Resolving and exploiting the $k$-{CFA} paradox: illuminating functional vs. object-oriented program analysis", journal = j-SIGPLAN, volume = "45", number = "6", pages = "305--315", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806631", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Low-level program analysis is a fundamental problem, taking the shape of 'flow analysis' in functional languages and 'points-to' analysis in imperative and object-oriented languages. Despite the similarities, the vocabulary and results in the two communities remain largely distinct, with limited cross-understanding. 
One of the few links is Shivers's $k$-CFA work, which has advanced the concept of 'context-sensitive analysis' and is widely known in both communities.\par Recent results indicate that the relationship between the functional and object-oriented incarnations of $k$-CFA is not as well understood as thought. Van Horn and Mairson proved $k$-CFA for $k \geq 1$ to be EXPTIME-complete; hence, no polynomial-time algorithm can exist. Yet, there are several polynomial-time formulations of context-sensitive points-to analyses in object-oriented languages. Thus, it seems that functional $k$-CFA may actually be a profoundly different analysis from object-oriented $k$-CFA. We resolve this paradox by showing that the exact same specification of $k$-CFA is polynomial-time for object-oriented languages yet exponential-time for functional ones: objects and closures are subtly different, in a way that interacts crucially with context-sensitivity and complexity. This illumination leads to an immediate payoff: by projecting the object-oriented treatment of objects onto closures, we derive a polynomial-time hierarchy of context-sensitive CFAs for functional programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "control-flow analysis; functional; k-cfa; m-cfa; object-oriented; pointer analysis; static analysis", } @Article{Kuncak:2010:CFS, author = "Viktor Kuncak and Mika{\"e}l Mayer and Ruzica Piskac and Philippe Suter", title = "Complete functional synthesis", journal = j-SIGPLAN, volume = "45", number = "6", pages = "316--329", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806632", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Synthesis of program fragments from specifications can make programs easier to write and easier to reason about. To integrate synthesis into programming languages, synthesis algorithms should behave in a predictable way - they should succeed for a well-defined class of specifications. They should also support unbounded data types such as numbers and data structures. We propose to generalize decision procedures into predictable and complete synthesis procedures. Such procedures are guaranteed to find code that satisfies the specification if such code exists. Moreover, we identify conditions under which synthesis will statically decide whether the solution is guaranteed to exist, and whether it is unique. We demonstrate our approach by starting from decision procedures for linear arithmetic and data structures and transforming them into synthesis procedures. We establish results on the size and the efficiency of the synthesized code. We show that such procedures are useful as a language extension with implicit value definitions, and we show how to extend a compiler to support such definitions. 
Our constructs provide the benefits of synthesis to programmers, without requiring them to learn new concepts or give up a deterministic execution model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "bapa; decision procedure; Presburger arithmetic; synthesis procedure", } @Article{Burckhardt:2010:LCA, author = "Sebastian Burckhardt and Chris Dern and Madanlal Musuvathi and Roy Tan", title = "{Line-Up}: a complete and automatic linearizability checker", journal = j-SIGPLAN, volume = "45", number = "6", pages = "330--340", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806634", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modular development of concurrent applications requires thread-safe components that behave correctly when called concurrently by multiple client threads. This paper focuses on linearizability, a specific formalization of thread safety, where all operations of a concurrent component appear to take effect instantaneously at some point between their call and return. The key insight of this paper is that if a component is intended to be deterministic, then it is possible to build an automatic linearizability checker by systematically enumerating the sequential behaviors of the component and then checking if each of its concurrent behaviors is equivalent to some sequential behavior.\par We develop this insight into a tool called Line-Up, the first complete and automatic checker for {\em deterministic linearizability}. It is complete, because any reported violation proves that the implementation is not linearizable with respect to {\em any\/} sequential deterministic specification. It is automatic, requiring no manual abstraction, no manual specification of semantics or commit points, no manually written test suites, no access to source code.\par We evaluate Line-Up by analyzing 13 classes with a total of 90 methods in two versions of the {.NET} Framework 4.0. The violations of deterministic linearizability reported by Line-Up exposed seven errors in the implementation that were fixed by the development team.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "atomicity; linearizability; thread safety", } @Article{Torlak:2010:MCA, author = "Emina Torlak and Mandana Vaziri and Julian Dolby", title = "{MemSAT}: checking axiomatic specifications of memory models", journal = j-SIGPLAN, volume = "45", number = "6", pages = "341--350", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806635", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Memory models are hard to reason about due to their complexity, which stems from the need to strike a balance between ease-of-programming and allowing compiler and hardware optimizations. In this paper, we present an automated tool, MemSAT, that helps in debugging and reasoning about memory models.
Given an axiomatic specification of a memory model and a multi-threaded test program containing assertions, MemSAT outputs a trace of the program in which both the assertions and the memory model axioms are satisfied, if one can be found. The tool is fully automatic and is based on a SAT solver. If it cannot find a trace, it outputs a minimal subset of the memory model and program constraints that are unsatisfiable. We used MemSAT to check several existing memory models against their published test cases, including the current Java Memory Model by Manson et al. and a revised version of it by Sevcik and Aspinall. We found subtle discrepancies between what was expected and the actual results of test programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "axiomatic specifications; bounded model checking; memory models; sat", } @Article{Marino:2010:DSE, author = "Daniel Marino and Abhayendra Singh and Todd Millstein and Madanlal Musuvathi and Satish Narayanasamy", title = "{DRFX}: a simple and efficient memory model for concurrent programming languages", journal = j-SIGPLAN, volume = "45", number = "6", pages = "351--362", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806636", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The most intuitive memory model for shared-memory multithreaded programming is {\em sequential consistency\/} (SC), but it disallows the use of many compiler and hardware optimizations thereby impacting performance. Data-race-free (DRF) models, such as the proposed C++0x memory model, guarantee SC execution for datarace-free programs. But these models provide no guarantee at all for racy programs, compromising the safety and debuggability of such programs. To address the safety issue, the Java memory model, which is also based on the DRF model, provides a weak semantics for racy executions. However, this semantics is subtle and complex, making it difficult for programmers to reason about their programs and for compiler writers to ensure the correctness of compiler optimizations.\par We present the DRFx memory model, which is simple for programmers to understand and use while still supporting many common optimizations. We introduce a {\em memory model (MM) exception\/} which can be signaled to halt execution. If a program executes without throwing this exception, then DRFx guarantees that the execution is SC. If a program throws an MM exception during an execution, then DRFx guarantees that the program has a data race. We observe that SC violations can be detected in hardware through a lightweight form of conflict detection. Furthermore, our model safely allows aggressive compiler and hardware optimizations within compiler-designated program regions. 
We formalize our memory model, prove several properties about this model, describe a compiler and hardware design suitable for DRFx, and evaluate the performance overhead due to our compiler and hardware requirements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "data races; memory model exception; memory models; sequential consistency; soft fences", } @Article{Chambers:2010:FEE, author = "Craig Chambers and Ashish Raniwala and Frances Perry and Stephen Adams and Robert R. Henry and Robert Bradshaw and Nathan Weizenbaum", title = "{FlumeJava}: easy, efficient data-parallel pipelines", journal = j-SIGPLAN, volume = "45", number = "6", pages = "363--375", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806638", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "MapReduce and similar systems significantly ease the task of writing data-parallel code. However, many real-world computations require a pipeline of MapReduces, and programming and managing such pipelines can be difficult. We present FlumeJava, a Java library that makes it easy to develop, test, and run efficient data-parallel pipelines. At the core of the FlumeJava library are a couple of classes that represent immutable parallel collections, each supporting a modest number of operations for processing them in parallel. Parallel collections and their operations present a simple, high-level, uniform abstraction over different data representations and execution strategies. To enable parallel operations to run efficiently, FlumeJava defers their evaluation, instead internally constructing an execution plan dataflow graph. When the final results of the parallel operations are eventually needed, FlumeJava first optimizes the execution plan, and then executes the optimized operations on appropriate underlying primitives (e.g., MapReduces). The combination of high-level abstractions for parallel data and computation, deferred evaluation and optimization, and efficient parallel primitives yields an easy-to-use system that approaches the efficiency of hand-optimized pipelines. FlumeJava is in active use by hundreds of pipeline developers within Google.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "data-parallel programming; java; mapreduce", } @Article{Pan:2010:CPS, author = "Heidi Pan and Benjamin Hindman and Krste Asanovi{\'c}", title = "Composing parallel software efficiently with {Lithe}", journal = j-SIGPLAN, volume = "45", number = "6", pages = "376--387", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806639", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Applications composed of multiple parallel libraries perform poorly when those libraries interfere with one another by obliviously using the same physical cores, leading to destructive resource oversubscription. 
This paper presents the design and implementation of {\em Lithe}, a low-level substrate that provides the basic primitives and a standard interface for composing parallel codes efficiently. Lithe can be inserted underneath the runtimes of legacy parallel libraries to provide {\em bolt-on\/} composability without needing to change existing application code. Lithe can also serve as the foundation for building new parallel abstractions and libraries that automatically interoperate with one another.\par In this paper, we show versions of Threading Building Blocks (TBB) and OpenMP perform competitively with their original implementations when ported to Lithe. Furthermore, for two applications composed of multiple parallel libraries, we show that leveraging our substrate outperforms their original, even expertly tuned, implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "composability; cooperative scheduling; hierarchical scheduling; oversubscription; parallelism; resource management; user-level scheduling", } @Article{Zhou:2010:BDC, author = "Jin Zhou and Brian Demsky", title = "{Bamboo}: a data-centric, object-oriented approach to many-core software", journal = j-SIGPLAN, volume = "45", number = "6", pages = "388--399", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806640", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Traditional data-oriented programming languages such as dataflow languages and stream languages provide a natural abstraction for parallel programming. In these languages, a developer focuses on the flow of data through the computation and these systems free the developer from the complexities of low-level, thread-oriented concurrency primitives. This simplification comes at a cost --- traditional data-oriented approaches restrict the mutation of state and, in practice, the types of data structures a program can effectively use. Bamboo borrows from work in typestate and software transactions to relax the traditional restrictions of data-oriented programming models to support mutation of arbitrary data structures.\par We have implemented a compiler for Bamboo which generates code for the TILEPro64 many-core processor. We have evaluated this implementation on six benchmarks: Tracking, a feature tracking algorithm from computer vision; KMeans, a K-means clustering algorithm; MonteCarlo, a Monte Carlo simulation; FilterBank, a multi-channel filter bank; Fractal, a Mandelbrot set computation; and Series, a Fourier series computation. 
We found that our compiler generated implementations that obtained speedups ranging from 26.2x to 61.6x when executed on 62 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "data-centric languages; many-core programming", } @Article{Westbrook:2010:MJM, author = "Edwin Westbrook and Mathias Ricken and Jun Inoue and Yilong Yao and Tamer Abdelatif and Walid Taha", title = "{Mint}: {Java} multi-stage programming using weak separability", journal = j-SIGPLAN, volume = "45", number = "6", pages = "400--411", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806642", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multi-stage programming (MSP) provides a disciplined approach to run-time code generation. In the purely functional setting, it has been shown how MSP can be used to reduce the overhead of abstractions, allowing clean, maintainable code without paying performance penalties. Unfortunately, MSP is difficult to combine with imperative features, which are prevalent in mainstream languages. The central difficulty is scope extrusion, wherein free variables can inadvertently be moved outside the scopes of their binders. This paper proposes a new approach to combining MSP with imperative features that occupies a 'sweet spot' in the design space in terms of how well useful MSP applications can be expressed and how easy it is for programmers to understand. The key insight is that escapes (or 'anti-quotes') must be weakly separable from the rest of the code, i.e. the computational effects occurring inside an escape that are visible outside the escape are guaranteed to not contain code. To demonstrate the feasibility of this approach, we formalize a type system based on Lightweight Java which we prove sound, and we also provide an implementation, called Mint, to validate both the expressivity of the type system and the effect of staging on the performance of Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "java; multi-staged languages; multi-stage programming; type systems", } @Article{Chen:2010:TPC, author = "Juan Chen and Ravi Chugh and Nikhil Swamy", title = "Type-preserving compilation for end-to-end verification of security enforcement", journal = j-SIGPLAN, volume = "45", number = "6", pages = "412--423", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806643", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A number of programming languages use rich type systems to verify security properties of code. Some of these languages are meant for source programming, but programs written in these languages are compiled without explicit security proofs, limiting their utility in settings where proofs are necessary, e.g., proof-carrying authorization. Other languages do include explicit proofs, but these are generally lambda calculi not intended for source programming, which must be further compiled to an executable form.
A language suitable for source programming backed by a compiler that enables end-to-end verification is missing.\par In this paper, we present a type-preserving compiler that translates programs written in FINE, a source-level functional language with dependent refinements and affine types, to DCIL, a new extension of the {.NET} Common Intermediate Language. FINE is type checked using an external SMT solver to reduce the proof burden on source programmers. We extract explicit LCF-style proof terms from the solver and carry these proof terms in the compilation to DCIL, thereby removing the solver from the trusted computing base. Explicit proofs enable DCIL to be used in a number of important scenarios, including the verification of mobile code, proof-carrying authorization, and evidence-based auditing. We report on our experience using FINE to build reference monitors for several applications, ranging from a plugin-based email client to a conference management server.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "authorization; bytecode languages; compilers; dependent types; functional programming; information flow; mobile code security; security type systems", } @Article{Tate:2010:IOO, author = "Ross Tate and Juan Chen and Chris Hawblitzel", title = "Inferable object-oriented typed assembly language", journal = j-SIGPLAN, volume = "45", number = "6", pages = "424--435", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806644", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A certifying compiler preserves type information through compilation to assembly language programs, producing typed assembly language (TAL) programs that can be verified for safety independently so that the compiler does not need to be trusted. There are two challenges for adopting certifying compilation in practice. First, requiring every compiler transformation and optimization to preserve types is a large burden on compilers, especially when adopting certifying compilation into existing optimizing non-certifying compilers. Second, type annotations significantly increase the size of assembly language programs.\par This paper proposes an alternative to traditional certifying compilers. It presents iTalX, the first inferable TAL type system that supports existential types, arrays, interfaces, and stacks. We have proved our inference algorithm is complete, meaning if an assembly language program is typeable with iTalX then our algorithm will infer an iTalX typing for that program. Furthermore, our algorithm is guaranteed to terminate even if the assembly language program is untypeable. We demonstrate that it is practical to infer such an expressive TAL by showing a prototype implementation of type inference for code compiled by Bartok, an optimizing C\# compiler. Our prototype implementation infers complete type annotations for 98\% of functions in a suite of realistic C\# benchmarks. The type-inference time is about 8\% of the compilation time. We needed to change only 2.5\% of the compiler code, mostly adding new code for defining types and for writing types to object files. Most transformations are untouched. 
Type-annotation size is only 17\% of the size of pure code and data, reducing type annotations in our previous certifying compiler [4] by 60\%. The compiler needs to preserve only essential type information such as method signatures, object-layout information, and types for static data and external labels. Even non-certifying compilers have most of this information available.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "certifying compiler; existential quantification; object-oriented compiler; Typed Assembly Language (TAL); type inference", } @Article{Khoo:2010:MTC, author = "Yit Phang Khoo and Bor-Yuh Evan Chang and Jeffrey S. Foster", title = "Mixing type checking and symbolic execution", journal = j-SIGPLAN, volume = "45", number = "6", pages = "436--447", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806645", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Static analysis designers must carefully balance precision and efficiency. In our experience, many static analysis tools are built around an elegant, core algorithm, but that algorithm is then extensively tweaked to add just enough precision for the coding idioms seen in practice, without sacrificing too much efficiency. There are several downsides to adding precision in this way: the tool's implementation becomes much more complicated; it can be hard for an end-user to interpret the tool's results; and as software systems vary tremendously in their coding styles, it may require significant algorithmic engineering to enhance a tool to perform well in a particular software domain.\par In this paper, we present Mix, a novel system that mixes type checking and symbolic execution. The key aspect of our approach is that these analyses are applied independently on disjoint parts of the program, in an off-the-shelf manner. At the boundaries between nested type checked and symbolically executed code regions, we use special mix rules to communicate information between the off-the-shelf systems. The resulting mixture is a provably sound analysis that is more precise than type checking alone and more efficient than exclusive symbolic execution. In addition, we also describe a prototype implementation, Mixy, for C. 
Mixy checks for potential null dereferences by mixing a null/non-null type qualifier inference system with a symbolic executor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "false alarms; mix; mixed off-the-shelf analysis; mix rules; precision; symbolic execution; type checking", } @Article{Chen:2010:EIO, author = "Yang Chen and Yuanjie Huang and Lieven Eeckhout and Grigori Fursin and Liang Peng and Olivier Temam and Chengyong Wu", title = "Evaluating iterative optimization across 1000 datasets", journal = j-SIGPLAN, volume = "45", number = "6", pages = "448--459", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806647", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "While iterative optimization has become a popular compiler optimization approach, it is based on a premise which has never been truly evaluated: that it is possible to learn the best compiler optimizations across data sets. Up to now, most iterative optimization studies find the best optimizations through repeated runs on the same data set. Only a handful of studies have attempted to exercise iterative optimization on a few tens of data sets.\par In this paper, we truly put iterative compilation to the test for the first time by evaluating its effectiveness across a large number of data sets. We therefore compose KDataSets, a data set suite with 1000 data sets for 32 programs, which we release to the public. We characterize the diversity of KDataSets, and subsequently use it to evaluate iterative optimization. We demonstrate that it is possible to derive a robust iterative optimization strategy across data sets: for all 32 programs, we find that there exists at least one combination of compiler optimizations that achieves 86\% or more of the best possible speedup across {\em all\/} data sets using Intel's ICC (83\% for GNU's GCC). This optimal combination is program-specific and yields speedups up to 1.71 on ICC and 2.23 on GCC over the highest optimization level (-fast and -O3, respectively). This finding makes the task of optimizing programs across data sets much easier than previously anticipated, and it paves the way for the practical and reliable usage of iterative optimization. Finally, we derive pre-shipping and post-shipping optimization strategies for software vendors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "benchmarking; compiler optimization; iterative optimization", } @Article{Kamruzzaman:2010:SDS, author = "Md Kamruzzaman and Steven Swanson and Dean M. Tullsen", title = "Software data spreading: leveraging distributed caches to improve single thread performance", journal = j-SIGPLAN, volume = "45", number = "6", pages = "460--470", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806648", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Single thread performance remains an important consideration even for multicore, multiprocessor systems. 
As a result, techniques for improving single thread performance using multiple cores have received considerable attention. This work describes a technique, {\em software data spreading}, that leverages the cache capacity of extra cores and extra sockets rather than their computational resources. Software data spreading is a software-only technique that uses compiler-directed thread migration to aggregate cache capacity across cores and chips and improve performance. This paper describes an automated scheme that applies data spreading to various types of loops. Experiments with a set of SPEC2000, SPEC2006, NAS, and microbenchmark workloads show that data spreading can provide speedup of over 2, averaging 17\% for the SPEC and NAS applications on two systems. In addition, despite using more cores for the same computation, data spreading actually saves power since it reduces access to DRAM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "chip multiprocessors; compilers; single-thread performance", } @Article{Sartor:2010:ZRD, author = "Jennifer B. Sartor and Stephen M. Blackburn and Daniel Frampton and Martin Hirzel and Kathryn S. McKinley", title = "{Z}-rays: divide arrays and conquer speed and flexibility", journal = j-SIGPLAN, volume = "45", number = "6", pages = "471--482", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806649", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Arrays are the ubiquitous organization for indexed data. Throughout programming language evolution, implementations have laid out arrays contiguously in memory. This layout is problematic in space and time. It causes heap fragmentation, garbage collection pauses in proportion to array size, and wasted memory for sparse and over-provisioned arrays. Because of array virtualization in managed languages, an array layout that consists of indirection pointers to fixed-size discontiguous memory blocks can mitigate these problems transparently. This design however incurs significant overhead, but is justified when real-time deadlines and space constraints trump performance.\par This paper proposes {\em z-rays}, a discontiguous array design with flexibility and efficiency. A z-ray has a spine with indirection pointers to fixed-size memory blocks called {\em arraylets}, and uses five optimizations: (1) inlining the first N array bytes into the spine, (2) lazy allocation, (3) zero compression, (4) fast array copy, and (5) arraylet copy-on-write. Whereas discontiguous arrays in prior work improve responsiveness and space efficiency, z-rays combine time efficiency and flexibility. On average, the best z-ray configuration performs within 12.7\% of an unmodified Java Virtual Machine on 19 benchmarks, whereas previous designs have {\em two to three times\/} higher overheads. Furthermore, language implementers can configure z-ray optimizations for various design goals. This combination of performance and flexibility creates a better building block for past and future array optimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "arraylets; arrays; compression; heap; z-rays", } @Article{Acar:2010:TDT, author = "Umut A. 
Acar and Guy Blelloch and Ruy Ley-Wild and Kanat Tangwongsan and Duru Turkoglu", title = "Traceable data types for self-adjusting computation", journal = j-SIGPLAN, volume = "45", number = "6", pages = "483--496", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806650", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Self-adjusting computation provides an evaluation model where computations can respond automatically to modifications to their data by using a mechanism for propagating modifications through the computation. Current approaches to self-adjusting computation guarantee correctness by recording dependencies in a trace at the granularity of individual memory operations. Tracing at the granularity of memory operations, however, has some limitations: it can be asymptotically inefficient (e.g., compared to optimal solutions) because it cannot take advantage of problem-specific structure, it requires keeping a large computation trace (often proportional to the runtime of the program on the current input), and it introduces moderately large constant factors in practice.\par In this paper, we extend dependence-tracing to work at the granularity of the query and update operations of arbitrary (abstract) data types, instead of just reads and writes on memory cells. This can significantly reduce the number of dependencies that need to be kept in the trace and followed during an update. We define an interface for supporting a traceable version of a data type, which reports the earliest query that depends on (is changed by) revising operations back in time, and implement several such structures, including priority queues, queues, dictionaries, and counters. We develop a semantics for tracing, extend an existing self-adjusting language, $\Delta$ML, and its implementation to support traceable data types, and present an experimental evaluation by considering a number of benchmarks. Our experiments show dramatic improvements on space and time, sometimes by as much as two orders of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "self-adjusting computation; traceable data types", } @Article{Chen:2010:TTT, author = "Peter M. Chen", title = "Transistors to toys: teaching systems to freshmen", journal = j-SIGPLAN, volume = "45", number = "7", pages = "1--2", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735997.1735998", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "How should we introduce students to the art of system building, and when are students ready to start designing and building interesting systems? In this talk, I describe an experimental course at the University of Michigan that teaches systems to freshmen by having them conceive of, design, and build the hardware and software of a microprocessor-based educational toy. Students in this course build their own microprocessor on an FPGA using a hardware description language.
They then write the complete software stack for their toy in assembly language, including device drivers for numerous I/O devices, a simple file system, a graphical user interface, digital audio processing, and application software. By building a substantial system involving hardware, system software, and application software, students gain an appreciation for the complexity and beauty of building computing systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "education", } @Article{Pohle:2010:CWM, author = "Aaron Pohle and Bj{\"o}rn D{\"o}bel and Michael Roitzsch and Hermann H{\"a}rtig", title = "Capability wrangling made easy: debugging on a microkernel with {{\tt valgrind}}", journal = j-SIGPLAN, volume = "45", number = "7", pages = "3--12", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837854.1736001", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Not all operating systems are created equal. Contrasting traditional monolithic kernels, there is a class of systems called microkernels more prevalent in embedded systems like cellphones, chip cards or real-time controllers. These kernels offer an abstraction very different from the classical POSIX interface. The resulting unfamiliarity for programmers complicates development and debugging. Valgrind is a well-known debugging tool that virtualizes execution to perform dynamic binary analysis. However, it assumes it is running on a POSIX-like kernel and closely interacts with the system to control execution. In this paper we analyze how to adapt Valgrind to a non-POSIX environment and describe our port to the Fiasco.OC microkernel. Additionally, we analyze bug classes that are indigenous to capability systems and show how Valgrind's flexibility can be leveraged to create custom debugging tools detecting these errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "capability; l4; microkernel; valgrind", } @Article{Chow:2010:MSR, author = "Jim Chow and Dominic Lucchetti and Tal Garfinkel and Geoffrey Lefebvre and Ryan Gardner and Joshua Mason and Sam Small and Peter M. Chen", title = "Multi-stage replay with {Crosscut}", journal = j-SIGPLAN, volume = "45", number = "7", pages = "13--24", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837854.1736002", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Deterministic record-replay has many useful applications, ranging from fault tolerance and forensics to reproducing and diagnosing bugs. When choosing a record-replay solution, the system administrator must choose a priori how comprehensively to record the execution and at what abstraction level to record it. Unfortunately, these choices may not match well with how the recording is eventually used. A recording may contain too little information to support the end use of replay, or it may contain more sensitive information than is allowed to be shown to the end user of replay.
Similarly, fixing the abstraction level at the time of recording often leads to a semantic mismatch with the end use of replay.\par This paper describes how to remedy these problems by adding customizable replay stages to create special-purpose logs for the end users of replay. Our system, called Crosscut, allows replay logs to be 'sliced' along time and abstraction boundaries. Using this approach, users can create slices that include only the processes, applications, or components of interest, excluding parts that handle sensitive data. Users can also retarget the abstraction level of the replay log to higher-level platforms, such as Perl or Valgrind. Execution can then be augmented with additional analysis code at replay time, without disturbing the replayed components in the slice. Crosscut thus uses replay itself to transform logs into a more efficient, secure, and usable form for replay-based applications.\par Our current Crosscut prototype builds on VMware Workstation's record-replay capabilities, and supports a variety of different replay environments. We show how Crosscut can create slices of only the parts of the computation of interest and thereby avoid leaking sensitive information, and we show how to retarget the abstraction level of the log to enable more convenient use during replay debugging.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "design; experimentation; performance; replay; security; virtual machines", } @Article{Huang:2010:OCD, author = "Yijian Huang and Haibo Chen and Binyu Zang", title = "Optimizing crash dump in virtualized environments", journal = j-SIGPLAN, volume = "45", number = "7", pages = "25--36", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837854.1736003", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Crash dump, or core dump is the typical way to save memory image on system crash for future offline debugging and analysis. However, for typical server machines with likely abundant memory, the time of core dump can significantly increase the mean time to repair (MTTR) by delaying the reboot-based recovery, while not dumping the failure context for analysis would risk recurring crashes on the same problems.\par In this paper, we propose several optimization techniques for core dump in virtualized environments, in order to shorten the MTTR of consolidated virtual machines during crashes. First, we parallelize the process of crash dump and the process of rebooting the crashed VM, by dynamically reclaiming and allocating memory between the crashed VM and the newly spawned VM. Second, we use the virtual machine management layer to introspect the critical data structures of the crashed VM to filter out the dump of unused memory. Finally, we implement disk I/O rate control between core dump and the newly spawned VM according to user-tuned rate control policy to balance the time of crash dump and quality of services in the recovery VM.\par We have implemented a working prototype, Vicover, that optimizes core dump on system crash of a virtual machine in Xen, to minimize the MTTR of core dump and recovery as a whole. 
In our experiment on a virtualized TPC-W server, Vicover shortens the downtime caused by crash dump by around 5X.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "core dump; parallel core dump; virtual machines", } @Article{Hunt:2010:LBS, author = "Galen C. Hunt", title = "Looking beyond a singularity", journal = j-SIGPLAN, volume = "45", number = "7", pages = "37--38", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837854.1735999", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "How does one build a truly dependable software system? Seven years ago, Microsoft Research started the Singularity project to answer this question. The premise was to start with the best known software development tools and to build a new kind of operating system from the ground up. The operating system was to be both an output artifact and a laboratory for the research. Portions of the code and ideas have been incorporated into three separate Microsoft operating systems so far. I will give a brief overview of Singularity planned and built, then describe what we learned, both positive and negative. I will speculate on OS futures including current research to build an operating system in which every last assembly instruction has been verified for type safety, a system for truly mobile computation, and new tools for automatically restructuring large software systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "sing\#; singularity; software-isolated processes (sips)", } @Article{Titzer:2010:ICR, author = "Ben L. Titzer and Thomas W{\"u}rthinger and Doug Simon and Marcelo Cintra", title = "Improving compiler-runtime separation with {XIR}", journal = j-SIGPLAN, volume = "45", number = "7", pages = "39--50", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837854.1736005", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Intense research on virtual machines has highlighted the need for flexible software architectures that allow quick evaluation of new design and implementation techniques. The interface between the compiler and runtime system is a principal factor in the flexibility of both components and is critical to enabling rapid pursuit of new optimizations and features. Although many virtual machines have demonstrated modularity for many components, significant dependencies often remain between the compiler and the runtime system components such as the object model and memory management system. This paper addresses this challenge with a carefully designed strict compiler-runtime interface and the XIR language. Instead of the compiler backend lowering object operations to machine operations using hard-wired runtime-specific logic, XIR allows the runtime system to implement this logic, simultaneously simplifying and separating the backend from runtime-system details. 
In this paper we describe the design and implementation of this compiler-runtime interface and the XIR language in the C1X dynamic compiler, a port of the HotSpot{\TM} Client compiler. Our results show a significant reduction in backend complexity with XIR and an overall reduction in the compiler-runtime interface complexity while still generating comparable quality code with only minor impact on compilation time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compilers; intermediate representations; java; JIT; lowering; object model; register allocation; runtime interface; software architecture; virtual machines", } @Article{Geoffray:2010:VSM, author = "Nicolas Geoffray and Ga{\"e}l Thomas and Julia Lawall and Gilles Muller and Bertil Folliot", title = "{VMKit}: a substrate for managed runtime environments", journal = j-SIGPLAN, volume = "45", number = "7", pages = "51--62", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735997.1736006", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Managed Runtime Environments (MREs), such as the JVM and the CLI, form an attractive environment for program execution, by providing portability and safety, via the use of a bytecode language and automatic memory management, as well as good performance, via just-in-time (JIT) compilation. Nevertheless, developing a fully featured MRE, including e.g. a garbage collector and JIT compiler, is a herculean task. As a result, new languages cannot easily take advantage of the benefits of MREs, and it is difficult to experiment with extensions of existing MRE based languages.\par This paper describes and evaluates VMKit, a first attempt to build a common substrate that eases the development of high-level MREs. We have successfully used VMKit to build two MREs: a Java Virtual Machine and a Common Language Runtime. We provide an extensive study of the lessons learned in developing this infrastructure, and assess the ease of implementing new MREs or MRE extensions and the resulting performance. In particular, it took one of the authors only one month to develop a Common Language Runtime using VMKit. VMKit furthermore has performance comparable to the well-established open-source MREs Cacao, Apache Harmony and Mono, and is 1.2 to 3 times slower than JikesRVM on most of the DaCapo benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "just in time compiler; virtual machine; VMKit", } @Article{Zhang:2010:NSS, author = "Qing Zhang and John McCullough and Justin Ma and Nabil Schear and Michael Vrable and Amin Vahdat and Alex C. Snoeren and Geoffrey M. Voelker and Stefan Savage", title = "{Neon}: system support for derived data management", journal = j-SIGPLAN, volume = "45", number = "7", pages = "63--74", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735997.1736008", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern organizations face increasingly complex information management requirements.
A combination of commercial needs, legal liability and regulatory imperatives has created a patchwork of mandated policies. Among these, personally identifying customer records must be carefully access-controlled, sensitive files must be encrypted on mobile computers to guard against physical theft, and intellectual property must be protected from both exposure and 'poisoning.' However, enforcing such policies can be quite difficult in practice since users routinely share data over networks and derive new files from these inputs--incidentally laundering any policy restrictions. In this paper, we describe a virtual machine monitor system called Neon that transparently labels derived data using byte-level 'tints' and tracks these labels end to end across commodity applications, operating systems and networks. Our goal with Neon is to explore the viability and utility of transparent information flow tracking within conventional networked systems when used in the manner in which they were intended. We demonstrate that this mechanism allows the enforcement of a variety of data management policies, including data-dependent confinement, mandatory I/O encryption, and intellectual property management.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "difc; memory tainting; qemu; virtualization; xen", } @Article{Ye:2010:EES, author = "Lei Ye and Gen Lu and Sushanth Kumar and Chris Gniady and John H. Hartman", title = "Energy-efficient storage in virtual machine environments", journal = j-SIGPLAN, volume = "45", number = "7", pages = "75--84", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735997.1736009", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Current trends in increasing storage capacity and virtualization of resources combined with the need for energy efficiency put a challenging task in front of system designers. Previous studies have suggested many approaches to reduce hard disk energy dissipation in native OS environments; however, those mechanisms do not perform well in virtual machine environments because a virtual machine (VM) and the virtual machine monitor (VMM) that runs it have different semantic contexts. This paper explores the disk I/O activities between VMM and VMs using trace driven simulation to understand the I/O behavior of the VM system. Subsequently, this paper proposes three mechanisms to address the isolation between VMM and VMs, and increase the burstiness of hard disk accesses to increase energy efficiency of a hard disk. Compared to standard shutdown mechanisms, with eight VMs the proposed mechanisms reduce disk spin-ups, increase the disk sleep time, and reduce energy consumption by 14.8\% with only 0.5\% increase in execution time. 
We implemented the proposed mechanisms in Xen and validated our simulation results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "energy management; storage system; virtual machine", } @Article{Kazempour:2010:AAA, author = "Vahid Kazempour and Ali Kamali and Alexandra Fedorova", title = "{AASH}: an asymmetry-aware scheduler for hypervisors", journal = j-SIGPLAN, volume = "45", number = "7", pages = "85--96", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837854.1736011", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Asymmetric multicore processors (AMP) consist of cores exposing the same instruction-set architecture (ISA) but varying in size, frequency, power consumption and performance. AMPs were shown to be more power efficient than conventional symmetric multicore processors, and it is therefore likely that future multicore systems will include cores of different types. AMPs derive their efficiency from core specialization: instruction streams can be assigned to run on the cores best suited to their demands for architectural resources. System efficiency is improved as a result. To perform effective matching of threads to cores, the thread scheduler must be asymmetry-aware; and while asymmetry-aware schedulers for operating systems are a well studied topic, asymmetry-awareness in hypervisors has not been addressed. A hypervisor must be asymmetry-aware to enable proper functioning of asymmetry-aware guest operating systems; otherwise they will be ineffective in virtual environments. Furthermore, a hypervisor must ensure that asymmetric cores are shared among multiple guests in a fair fashion or in accordance with their priorities.\par This work for the first time implements simple changes to the hypervisor scheduler, required to make it asymmetry-aware, and evaluates the benefits and overheads of these asymmetry-aware mechanisms. Our evaluation was performed using an open source hypervisor Xen on a real multicore system where asymmetry was emulated via CPU frequency scaling. We compared the asymmetry-aware hypervisor to default Xen. Our results indicate that asymmetry support can be implemented with low overheads, and resulting performance improvements can be significant, reaching up to 36\% in our experiments. Most performance improvements are derived from the fact that an asymmetry-aware hypervisor ensures that the fast cores do not go idle before slow cores and from the fact that it maps virtual cores to physical cores for asymmetry-aware guests according to the guest's expectations. Other benefits from asymmetry awareness are fairer sharing of computing resources among VMs and more stable execution times.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "asymmetric; heterogeneous; hypervisor; multicore processors; scheduling algorithms; virtual machine monitor", } @Article{Lee:2010:SSR, author = "Min Lee and A. S. Krishnakumar and P. 
Krishnan and Navjot Singh and Shalini Yajnik", title = "Supporting soft real-time tasks in the {Xen} hypervisor", journal = j-SIGPLAN, volume = "45", number = "7", pages = "97--108", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735997.1736012", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Virtualization technology enables server consolidation and has given an impetus to low-cost green data centers. However, current hypervisors do not provide adequate support for real-time applications, and this has limited the adoption of virtualization in some domains. Soft real-time applications, such as media-based ones, are impeded by components of virtualization including low-performance virtualization I/O, increased scheduling latency, and shared-cache contention. The virtual machine scheduler is central to all these issues. The goal in this paper is to adapt the virtual machine scheduler to be more soft-real-time friendly.\par We improve two aspects of the VMM scheduler -- managing scheduling latency as a first-class resource and managing shared caches. We use enterprise IP telephony as an illustrative soft real-time workload and design a scheduler S that incorporates the knowledge of soft real-time applications in {\em all\/} aspects of the scheduler to support responsiveness. For this we first define a {\em laxity\/} value that can be interpreted as the target scheduling latency that the workload desires. The load balancer is also designed to minimize the latency for real-time tasks. For cache management, we take cache-affinity into account for real time tasks and load-balance accordingly to prevent cache thrashing. We measured cache misses and demonstrated that cache management is essential for soft real time tasks. Although our scheduler S employs a different design philosophy, interestingly enough it can be implemented with simple modifications to the Xen hypervisor's credit scheduler. Our experiments demonstrate that the Xen scheduler with our modifications can support soft real-time guests well, without penalizing non-real-time domains.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "enterprise telephony workloads; laxity; server consolidation; virtualization; xen", } @Article{Odaira:2010:ERT, author = "Rei Odaira and Kazunori Ogata and Kiyokuni Kawachiya and Tamiya Onodera and Toshio Nakatani", title = "Efficient runtime tracking of allocation sites in {Java}", journal = j-SIGPLAN, volume = "45", number = "7", pages = "109--120", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837854.1736014", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Tracking the allocation site of every object at runtime is useful for reliable, optimized Java. To be used in production environments, the tracking must be accurate with minimal speed loss. Previous approaches suffer from performance degradation due to the additional field added to each object or track the allocation sites only probabilistically. 
We propose two novel approaches to track the allocation sites of every object in Java with only a 1.0\% slow-down on average. Our first approach, the {\em Allocation-Site-as-a-Hash-code (ASH) Tracker}, encodes the allocation site ID of an object into the hash code field of its header by regarding the ID as part of the hash code. ASH Tracker avoids an excessive increase in hash code collisions by dynamically shrinking the bit-length of the ID as more and more objects are allocated at that site. For those Java VMs without the hash code field, our second approach, the {\em Allocation-Site-via-a-Class-pointer (ASC) Tracker}, makes the class pointer field in an object header refer to the allocation site structure of the object, which in turn points to the actual class structure. ASC Tracker mitigates the indirection overhead by constant-class-field duplication and allocation-site equality checks. While a previous approach of adding a 4-byte field caused up to 14.4\% and an average 5\% slowdown, both ASH and ASC Trackers incur at most a 2.0\% and an average 1.0\% loss. We demonstrate the usefulness of our low-overhead trackers by an allocation-site-aware memory leak detector and allocation-site-based pretenuring in generational GC. Our pretenuring achieved on average 1.8\% and up to 11.8\% speedups in SPECjvm2008.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "allocation site; hash code; memory allocation", } @Article{Tatsubori:2010:EJT, author = "Michiaki Tatsubori and Akihiko Tozawa and Toyotaro Suzumura and Scott Trent and Tamiya Onodera", title = "Evaluation of a just-in-time compiler retrofitted for {PHP}", journal = j-SIGPLAN, volume = "45", number = "7", pages = "121--132", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735997.1736015", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmers who develop Web applications often use dynamic scripting languages such as Perl, PHP, Python, and Ruby. For general purpose scripting language usage, interpreter-based implementations are efficient and popular but the server-side usage for Web application development implies an opportunity to significantly enhance Web server throughput. This paper summarizes a study of the optimization of PHP script processing. We developed a PHP processor, P9, by adapting an existing production-quality just-in-time (JIT) compiler for a Java virtual machine, for which optimization technologies have been well-established, especially for server-side application. This paper describes and contrasts microbenchmarks and SPECweb2005 benchmark results for a well-tuned configuration of a traditional PHP interpreter and our JIT compiler-based implementation, P9. Experimental results with the microbenchmarks show 2.5-9.5x advantage with P9, and the SPECweb2005 measurements show about 20-30\% improvements. These results show that the acceleration of dynamic scripting language processing does matter in a realistic Web application server environment. 
CPU usage profiling shows our simple JIT compiler introduction reduces the PHP core runtime overhead from 45\% to 13\% for a SPECweb2005 scenario, implying that further improvements of dynamic compilers would provide little additional return unless other major overheads such as heavy memory copy between the language runtime and Web server frontend are reduced.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic scripting languages; just-in-time compiler; php", } @Article{Namjoshi:2010:NOP, author = "Manjiri A. Namjoshi and Prasad A. Kulkarni", title = "Novel online profiling for virtual machines", journal = j-SIGPLAN, volume = "45", number = "7", pages = "133--144", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735997.1736016", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Application {\em profiling\/} is a popular technique to improve program performance based on its behavior. {\em Offline\/} profiling, although beneficial for several applications, fails in cases where prior program runs may not be feasible, or if changes in input cause the profile to not match the behavior of the actual program run. Managed languages, like Java and C\#, provide a unique opportunity to overcome the drawbacks of offline profiling by generating the profile information online during the current program run. Indeed, online profiling is extensively used in current VMs, especially during selective compilation to improve program {\em startup\/} performance, as well as during other feedback-directed optimizations.\par In this paper we illustrate the drawbacks of the current {\em reactive\/} mechanism of online profiling during selective compilation. Current VM profiling mechanisms are slow -- thereby delaying associated transformations, and estimate future behavior based on the program's immediate past -- leading to potential misspeculation that limits the benefits of compilation. We show that these drawbacks produce an average performance loss of over 14.5\% on our set of benchmark programs, over an {\em ideal offline\/} approach that accurately compiles the hot methods early. We then propose and evaluate the potential of a novel strategy to achieve similar performance benefits with an online profiling approach. Our new online profiling strategy uses early determination of loop iteration bounds to predict future method hotness.
We explore and present promising results on the potential, feasibility, and other issues involved in the successful implementation of this approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "java; online profiling; virtual machines", } @Article{Guha:2010:DPS, author = "Apala Guha and Kim Hazelwood and Mary Lou Soffa", title = "{DBT} path selection for holistic memory efficiency and performance", journal = j-SIGPLAN, volume = "45", number = "7", pages = "145--156", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837854.1736018", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic binary translators (DBTs) provide powerful platforms for building dynamic program monitoring and adaptation tools. DBTs, however, have high memory demands because they cache translated code and auxiliary code to a software code cache and must also maintain data structures to support the code cache. The high memory demands make it difficult for memory-constrained embedded systems to take advantage of DBT-based tools. Previous research on DBT memory management focused on the translated code and auxiliary code only. However, we found that data structures are comparable to the code cache in size. We show that the translated code size, auxiliary code size and the data structure size interact in a complex manner, depending on the path selection (trace selection and link formation) strategy. Therefore, holistic memory efficiency (comprising translated code, auxiliary code and data structures) cannot be improved by focusing on the code cache only. In this paper, we use path selection for improving holistic memory efficiency which in turn impacts performance in memory-constrained environments. Although there has been previous research on path selection, such research only considered performance in memory-unconstrained environments.\par The challenge for holistic memory efficiency is that the path selection strategy results in complex interactions between the memory demand components. Also, individual aspects of path selection and the holistic memory efficiency may impact performance in complex ways. We explore these interactions to motivate path selection targeting holistic memory demand. We enumerate all the aspects involved in a path selection design and evaluate a comprehensive set of approaches for each aspect.
Finally, we propose a path selection strategy that reduces memory demands by 20\% and at the same time improves performance by 5-20\% compared to an industrial-strength DBT.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic binary translation; embedded systems; memory management; path selection; virtual machines", } @Article{Kondoh:2010:DBT, author = "Goh Kondoh and Hideaki Komatsu", title = "Dynamic binary translation specialized for embedded systems", journal = j-SIGPLAN, volume = "45", number = "7", pages = "157--166", month = jul, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1735997.1736019", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:01 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes the design and implementation of a novel dynamic binary translation technique specialized for embedded systems. Virtual platforms have been widely used to develop embedded software and dynamic binary translation is essential to boost their speed in simulations. However, unlike application simulation, the code generated for systems simulation is still slow because the simulator must replicate all of the functions of the target hardware. Embedded systems, which focus on providing one or a few functions, utilize only a small portion of the processor's features most of the time. For example, they may use a Memory Management Unit (MMU) in a processor to map physical memory to effective addresses, but they may not need paged memory support as in an OS. We can exploit this to specialize the dynamically translated code for more performance.\par We built a specialization framework on top of a functional simulator with a dynamic binary translator. Using the framework, we implemented three specializers for an MMU, bi-endianness, and register banks. Experiments with the EEMBC1.1 benchmark showed that the speed of the specialized code was up to 39\% faster than the unspecialized code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic binary translation; embedded systems; partial evaluation; specialization", } @Article{Barabash:2010:TGC, author = "Katherine Barabash and Erez Petrank", title = "Tracing garbage collection on highly parallel platforms", journal = j-SIGPLAN, volume = "45", number = "8", pages = "1--10", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837855.1806653", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The pervasiveness of multiprocessor and multicore hardware and the rising level of available parallelism are radically changing the computing landscape. Can software deal with tomorrow's potential higher parallelism? In this paper we study this issue from the garbage collection perspective. In particular, we investigate the scalability of parallel heap tracing, which stands at the core of the garbage collection activity. Heap shapes can be sequential in nature, and prevent the collector from scaling the trace. 
We start by proposing the idealized trace utilization as a measure for evaluating the scalability of a given heap shape. We then examine standard Java benchmarks and evaluate the existence of non-scalable object-graph shapes in their execution. Next, we propose and implement a prototype of garbage collection techniques that attempt to ameliorate the object-graph shape problem. Finally, we measure and report their efficacy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "garbage collection; memory management; parallel garbage collection; runtime systems", } @Article{Siebert:2010:CPR, author = "Fridtjof Siebert", title = "Concurrent, parallel, real-time garbage-collection", journal = j-SIGPLAN, volume = "45", number = "8", pages = "11--20", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837855.1806654", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the current developments in CPU implementations, it becomes obvious that ever more parallel multicore systems will be used even in embedded controllers that require real-time guarantees. When garbage collection is used in these systems, parallel and concurrent garbage collection brings important performance advantages in the average case. In a real-time system, however, guarantees on the GC's performance in the worst case are required.\par This paper explains how the single-CPU real-time GC of the Java implementation JamaicaVM was changed to make it a hard real-time garbage collector that is parallel and concurrent. Parallel means that an arbitrary number of CPUs may perform GC work in parallel, while concurrent means that the GC work can be performed concurrently to the application code without pre-empting the application. In addition, the single units of work that this garbage collector has to perform are very small and uniform, and the total amount of GC work is bounded by a function of the heap size, so that for any application with a bounded amount of reachable memory the GC work can be scheduled such that sufficient GC progress is ensured and the application never runs out of heap space.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "concurrent; garbage collection; java; multicore; parallel; real-time", } @Article{Anderson:2010:OPN, author = "Todd A. Anderson", title = "Optimizations in a private nursery-based garbage collector", journal = j-SIGPLAN, volume = "45", number = "8", pages = "21--30", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837855.1806655", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes a garbage collector designed around the use of permanent, private, thread-local nurseries and is principally oriented towards functional languages. We try to maximize the cache hit rate by having threads continually reuse their individual private nurseries.
These private nurseries operate in such a way that they can be garbage collected independently of other threads, which creates low collection pause times. Objects which survive thread-local collections are moved to a mature generation that can be collected either concurrently or in a stop-the-world fashion. We describe several optimizations (including two dynamic control parameter adaptation schemes) related to garbage collecting the private nurseries and to our concurrent collector, some of which are made possible when the language provides mutability information. We tested our collector against six benchmarks and saw single-threaded performance improvements in the range of 5-74\%. We also saw a 10x increase (for 24 processors) in scalability for one parallel benchmark that had previously been memory-bound.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "functional languages; garbage collection", } @Article{Nagarakatte:2010:CCE, author = "Santosh Nagarakatte and Jianzhou Zhao and Milo M. K. Martin and Steve Zdancewic", title = "{CETS}: compiler enforced temporal safety for {C}", journal = j-SIGPLAN, volume = "45", number = "8", pages = "31--40", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837855.1806657", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Temporal memory safety errors, such as dangling pointer dereferences and double frees, are a prevalent source of software bugs in unmanaged languages such as C. Existing schemes that attempt to retrofit temporal safety for such languages have high runtime overheads and/or are incomplete, thereby limiting their effectiveness as debugging aids. This paper presents CETS, a compile-time transformation for detecting all violations of temporal safety in C programs. Inspired by existing approaches, CETS maintains a unique identifier with each object, associates this metadata with the pointers in a disjoint metadata space to retain memory layout compatibility, and checks that the object is still allocated on pointer dereferences. A formal proof shows that this is sufficient to provide temporal safety even in the presence of arbitrary casts if the program contains no spatial safety violations. Our CETS prototype employs both temporal check removal optimizations and traditional compiler optimizations to achieve a runtime overhead of just 48\% on average. 
When combined with a spatial-checking system, the average overall overhead is 116\% for complete memory safety.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "c; dangling pointers; memory safety; temporal errors", } @Article{Vechev:2010:PPC, author = "Martin Vechev and Eran Yahav and Greta Yorsh", title = "{PHALANX}: parallel checking of expressive heap assertions", journal = j-SIGPLAN, volume = "45", number = "8", pages = "41--50", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837855.1806658", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Unrestricted use of heap pointers makes software systems difficult to understand and to debug. To address this challenge, we developed PHALANX -- a practical framework for dynamically checking expressive heap properties such as ownership, sharing and reachability. PHALANX uses novel parallel algorithms to efficiently check a wide range of heap properties utilizing the available cores.\par The PHALANX runtime is implemented on top of IBM's production Java virtual machine. This has enabled us to apply our new techniques to real-world software. We checked expressive heap properties in various scenarios and found the runtime support to be valuable for debugging and program understanding. Further, our experimental results on DaCapo and other benchmarks indicate that evaluating heap queries using parallel algorithms can lead to significant performance improvements, often resulting in linear speedups as the number of cores increases.\par To encourage adoption by programmers, we extended an existing JML compiler to translate expressive JML assertions about the heap into their efficient implementation provided by PHALANX. To debug her program, a programmer can annotate it with expressive heap assertions in JML, which are efficiently checked by PHALANX.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "ownership; parallel garbage collector; virtual machine", } @Article{Sewell:2010:MEA, author = "Peter Sewell", title = "Memory, an elusive abstraction", journal = j-SIGPLAN, volume = "45", number = "8", pages = "51--52", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806660", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multiprocessors are now ubiquitous. They provide an abstraction of shared memory, accessible by concurrently executing threads, which supports a wide range of software. However, exactly what this key abstraction is -- what the hardware designers implement, and what programmers can depend on -- is surprisingly elusive. In 1979, when articulating the notion of sequential consistency (SC), Lamport wrote 'For some applications, achieving sequential consistency may not be worth the price of slowing down the processors.' [7], and indeed most major multiprocessor families, including Alpha, ARM, Itanium, Power, Sparc, and x86, do not provide the abstraction of SC memory.
Internally, they incorporate a range of sophisticated optimisations which have various programmer-visible effects. For some (such as Sparc) these effects are captured in a well-defined relaxed memory model, making it possible (if challenging) to reason with confidence about the behaviour of concurrent programs. For others, however, it has been very unclear what a reasonable model is, despite extensive research over the last three decades. In this talk, I will reflect on the experience of my colleagues and me in trying to establish usable models for x86 multiprocessors, where it appears that our x86-TSO model suffices for common-case code [1-4], and for Power and ARM multiprocessors, where we have models that capture some but not all aspects of their behaviour [5,6]. The underlying causes of these difficulties are complex, including:\par The programmer-observable relaxed-memory behaviour of a multiprocessor is a whole-system property that arises from the interaction between many complex aspects of the processor implementation: speculative execution, store buffering, cache protocol, and so forth. Programs are executed (and tested) on specific multiprocessor implementations, but processor vendors attempt to document loose specifications to cover a range of possible (past and future) implementations. Multiprocessor implementation details are typically confidential and may change radically from one implementation to another. Vendor specifications suffer from the tension between the need for loose specification, to preserve freedom for such changes, and the need for tight specification, to give strong properties to programmers. All too often, loose specification has been achieved by vague specification, using informal prose. When it comes to subtle concurrent properties this is almost inevitably ambiguous; it also makes it impossible (even in principle) to test conformance between a processor implementation and such a specification, let alone to verify such a correspondence or to reason about concurrent programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "relaxed memory models; semantics", } @Article{Petricek:2010:CHG, author = "Tomas Petricek and Don Syme", title = "Collecting {Hollywood}'s garbage: avoiding space-leaks in composite events", journal = j-SIGPLAN, volume = "45", number = "8", pages = "53--62", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837855.1806662", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The reactive programming model is largely different to what we're used to, as we don't have full control over the application's control flow. If we mix the declarative and imperative programming style, which is usual in the ML family of languages, the situation is even more complex. It becomes easy to introduce patterns where the usual garbage collector for objects cannot automatically dispose of all components that we intuitively consider garbage.\par In this paper we discuss a duality between the definitions of garbage for {\em objects\/} and {\em events}.
We combine them into a single definition to specify the notion of garbage for the reactive programming model in a mixed functional/imperative language, and we present a formal algorithm for collecting garbage in this environment.\par Building on top of the theoretical model, we implement a library for reactive programming that does not cause leaks when used in the mixed declarative/imperative model. The library allows us to safely combine both of the reactive programming patterns. As a result, we can take advantage of the clarity and simplicity of the declarative approach as well as the expressivity of the imperative model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "combinator libraries; duality; event-driven; garbage collection; inversion of control; reactive programming", } @Article{Tian:2010:SPU, author = "Chen Tian and Min Feng and Rajiv Gupta", title = "Speculative parallelization using state separation and multiple value prediction", journal = j-SIGPLAN, volume = "45", number = "8", pages = "63--72", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806663", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the availability of chip multiprocessor (CMP) and simultaneous multithreading (SMT) machines, extracting thread-level parallelism from a sequential program has become crucial for improving performance. However, many sequential programs cannot be easily parallelized due to the presence of dependences. To solve this problem, different solutions have been proposed. Some of them make the optimistic assumption that such dependences rarely manifest themselves at runtime. However, when this assumption is violated, the recovery causes very large overhead. Other approaches incur large synchronization or computation overhead when resolving the dependences. Consequently, for a loop with frequently arising cross-iteration dependences, previous techniques are not able to speed up the execution. In this paper we propose a compiler technique which uses state separation and multiple value prediction to speculatively parallelize loops in sequential programs that contain frequently arising cross-iteration dependences. The key idea is to generate multiple versions of a loop iteration based on multiple predictions of values of variables involved in cross-iteration dependences (i.e., live-in variables). These speculative versions and the preceding loop iteration are executed in separate memory states simultaneously. After the execution, if one of these versions is correct (i.e., its predicted values are found to be correct), then we merge its state and the state of the preceding iteration because the dependence between the two iterations is correctly resolved. The memory states of other incorrect versions are completely discarded. Based on this idea, we further propose a runtime adaptive scheme that not only gives good performance but also achieves better CPU utilization. We conducted experiments on 10 benchmark programs on a real machine.
The results show that our technique can achieve a 1.7x speedup on average across all benchmarks used.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "multicore processors; speculative parallelization", } @Article{Ugawa:2010:IRB, author = "Tomoharu Ugawa and Hideya Iwasaki and Taiichi Yuasa", title = "Improved replication-based incremental garbage collection for embedded systems", journal = j-SIGPLAN, volume = "45", number = "8", pages = "73--82", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806664", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We have developed an incremental compacting garbage collector for embedded Java systems. The collector divides the heap into equal-sized pages and uses segregated free lists for fast allocation. Collectors that have such a heap layout have a problem of fragmentation in allocating objects larger than the page size. We solve this problem by using replication-based incremental compaction. The compactor evacuates all objects in one area, the evacuation area, of the heap, thereby creating a large chunk of free space. We developed an algorithm for choosing the evacuation area that effectively cures fragmentation. The compactor does not use any read barriers. Instead, it uses a technique similar to replication-based incremental copying collection. This needs forwarding pointers for all evacuated objects. Rather than introducing an extra field for each object, we use a hash table to store forwarding pointers.\par Evaluation of this garbage collector implemented in Sun's J2ME Java Virtual Machine showed that all the benchmarks used were able to run without memory starvation using heap sizes of only 151\%-286\% of the maximum amount of live data plus 8 KB for the hash table. Experiments on a desktop computer, though it is not a platform for embedded systems, showed that the maximum pause time was shorter than 200 $\mu$s, which was comparable to that of our implementation of the snapshot-at-the-beginning collector without compaction. On an ARM processor, the runtime overhead was 1\%-16\%, with 8.0\% on average, compared to the mark-sweep collector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "compaction; embedded systems; fragmentation; garbage collection; real-time garbage collection", } @Article{Hellyer:2010:LCW, author = "Laurence Hellyer and Richard Jones and Antony L. Hosking", title = "The locality of concurrent write barriers", journal = j-SIGPLAN, volume = "45", number = "8", pages = "83--92", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806666", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent and incremental collectors require barriers to ensure correct synchronisation between mutator and collector. The overheads imposed by particular barriers on particular systems have been widely studied.
Somewhat fewer studies have also compared barriers in terms of their termination properties or the volume of floating garbage they generate. Until now, the consequences for locality of different barrier choices have not been studied, although locality will be of increasing importance for emerging architectures. This paper provides a study of the locality of concurrent write barriers, independent of the processor architecture, virtual machine, compiler or garbage collection algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "garbage collection; java; language implementation; memory management", } @Article{Zhao:2010:EMS, author = "Qin Zhao and Derek Bruening and Saman Amarasinghe", title = "Efficient memory shadowing for 64-bit architectures", journal = j-SIGPLAN, volume = "45", number = "8", pages = "93--102", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806667", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Shadow memory is used by dynamic program analysis tools to store metadata for tracking properties of application memory. The efficiency of mapping between application memory and shadow memory has a substantial impact on the overall performance of such analysis tools. However, traditional memory mapping schemes that work well on 32-bit architectures cannot be easily ported to 64-bit architectures due to the much larger 64-bit address space.\par This paper presents EMS64, an efficient memory shadowing scheme for 64-bit architectures. By taking advantage of application reference locality and unused regions in the 64-bit address space, EMS64 provides a fast and flexible memory mapping scheme without relying on any underlying platform features or requiring any specific shadow memory size. Our experiments show that EMS64 is able to reduce the runtime shadow memory translation overhead to 81\% on average, which almost halves the overhead of the fastest 64-bit shadow memory system we are aware of.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "dynamic optimization; shadow memory", } @Article{Singer:2010:EGC, author = "Jeremy Singer and Richard E. Jones and Gavin Brown and Mikel Luj{\'a}n", title = "The economics of garbage collection", journal = j-SIGPLAN, volume = "45", number = "8", pages = "103--112", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806669", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper argues that economic theory can improve our understanding of memory management. We introduce the {\em allocation curve} as an analogue of the demand curve from microeconomics. An allocation curve for a program characterises how the amount of garbage collection activity required during its execution varies in relation to the heap size associated with that program. The standard treatment of microeconomic demand curves (shifts and elasticity) can be applied directly and intuitively to our new allocation curves.
As an application of this new theory, we show how {\em allocation elasticity\/} can be used to control the heap growth rate for variable-sized heaps in Jikes RVM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "allocation curve; elasticity; garbage collection; java; memory management; microeconomics", } @Article{Beg:2010:GTA, author = "Mirza Beg and Peter van Beek", title = "A graph theoretic approach to cache-conscious placement of data for direct mapped caches", journal = j-SIGPLAN, volume = "45", number = "8", pages = "113--120", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806670", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Caches were designed to amortize the cost of memory accesses by moving copies of frequently accessed data closer to the processor. Over the years, the increasing gap between processor speed and memory access latency has made the cache a bottleneck for program performance. Enhancing cache performance has been instrumental in speeding up programs. For this reason several hardware and software techniques have been proposed by researchers to optimize the cache for minimizing the number of misses. Among these are compile-time data placement techniques in memory which improve cache performance. For the purpose of this work, we concern ourselves with the problem of laying out data in memory given the sequence of accesses on a finite set of data objects such that cache misses are minimized. The problem has been shown to be hard to solve optimally even if the sequence of data accesses is known at compile time. In this paper we show that given a direct-mapped cache, its size, and the data access sequence, it is possible to identify the instances where there are no conflict misses. We describe an algorithm that can assign the data to cache for a minimal number of misses if there exists a way in which conflict misses can be avoided altogether. We also describe the implementation of a heuristic for assigning data to cache for instances where the size of the cache forces conflict misses. Experiments show that our technique results in a 30\% reduction in the number of cache misses compared to the original assignment.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "cache consciousness; cache optimization; data placement in cache; memory management; offline algorithms", } @Article{Albert:2010:PIM, author = "Elvira Albert and Samir Genaim and Miguel G{\'o}mez-Zamalloa", title = "Parametric inference of memory requirements for garbage collected languages", journal = j-SIGPLAN, volume = "45", number = "8", pages = "121--130", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806671", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The accurate prediction of a program's memory requirements is a critical component in software development.
Existing heap space analyses either do not take deallocation into account or adopt specific models of garbage collectors which do not necessarily correspond to the actual memory usage. We present a novel approach to inferring upper bounds on memory requirements of Java-like programs which is {\em parametric\/} on the notion of {\em object lifetime}, i.e., on when objects become collectible. If object lifetimes are inferred by a reachability analysis, then our analysis infers accurate upper bounds on the memory consumption for a {\em reachability\/}-based garbage collector. Interestingly, if object lifetimes are inferred by a {\em heap liveness\/} analysis, then we approximate the program's minimal memory requirement, i.e., the peak memory usage when using an optimal garbage collector which frees objects as soon as they become dead. The key idea is to integrate information on object lifetimes into the process of generating the {\em recurrence equations\/} which capture the memory usage at the different program states. If the heap size limit is set to the memory requirement inferred by our analysis, it is ensured that execution will not exceed the memory limit, with the only assumption that garbage collection works when the limit is reached. Experiments on Java bytecode programs provide evidence of the feasibility and accuracy of our analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "garbage collection; java bytecode; live heap space analysis; low-level languages; peak memory consumption", } @Article{Gordon:2010:MMO, author = "Michael J. C. Gordon", title = "{ML}: metalanguage or object language?", journal = j-SIGPLAN, volume = "45", number = "9", pages = "1--2", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863545", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chapman:2010:GAL, author = "James Chapman and Pierre-{\'E}variste Dagand and Conor McBride and Peter Morris", title = "The gentle art of levitation", journal = j-SIGPLAN, volume = "45", number = "9", pages = "3--14", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863547", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Vytiniotis:2010:FPE, author = "Dimitrios Vytiniotis and Andrew J.
Kennedy", title = "Functional pearl: every bit counts", journal = j-SIGPLAN, volume = "45", number = "9", pages = "15--26", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863548", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Buisson:2010:RES, author = "J{\'e}r{\'e}my Buisson and Fabien Dagnat", title = "{ReCaml}: execution state as the cornerstone of reconfigurations", journal = j-SIGPLAN, volume = "45", number = "9", pages = "27--38", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863550", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mazurak:2010:LCC, author = "Karl Mazurak and Steve Zdancewic", title = "{Lolliproc}: to concurrency from classical linear logic via {Curry--Howard} and control", journal = j-SIGPLAN, volume = "45", number = "9", pages = "39--50", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863551", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{VanHorn:2010:AAM, author = "David {Van Horn} and Matthew Might", title = "Abstracting abstract machines", journal = j-SIGPLAN, volume = "45", number = "9", pages = "51--62", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863553", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Holdermans:2010:PFA, author = "Stefan Holdermans and Jurriaan Hage", title = "Polyvariant flow analysis with higher-ranked polymorphic types and higher-order effect operators", journal = j-SIGPLAN, volume = "45", number = "9", pages = "63--74", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863554", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Naylor:2010:RR, author = "Matthew Naylor and Colin Runciman", title = "The {Reduceron} reconfigured", journal = j-SIGPLAN, volume = "45", number = "9", pages = "75--86", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863556", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L =
"0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The leading implementations of graph reduction all target conventional processors designed for low-level imperative execution. In this paper, we present a processor specially designed to perform graph reduction. Our processor, the Reduceron, is implemented using off-the-shelf reconfigurable hardware. We highlight the low-level parallelism present in sequential graph reduction, and show how parallel memories and dynamic analyses are used in the Reduceron to achieve an average reduction rate of 0.55 function applications per clock cycle.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Scott:2010:UFP, author = "David Scott and Richard Sharp and Thomas Gazagnaire and Anil Madhavapeddy", title = "Using functional programming within an industrial product group: perspectives and perceptions", journal = j-SIGPLAN, volume = "45", number = "9", pages = "87--92", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863557", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bergstrom:2010:LTS, author = "Lars Bergstrom and Mike Rainey and John Reppy and Adam Shaw and Matthew Fluet", title = "Lazy tree splitting", journal = j-SIGPLAN, volume = "45", number = "9", pages = "93--104", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863558", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bierman:2010:SSS, author = "Gavin M. Bierman and Andrew D.
Gordon and Catalin Hritcu and David Langworthy", title = "Semantic subtyping with an {SMT} solver", journal = j-SIGPLAN, volume = "45", number = "9", pages = "105--116", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863560", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tobin-Hochstadt:2010:LTU, author = "Sam Tobin-Hochstadt and Matthias Felleisen", title = "Logical types for untyped languages", journal = j-SIGPLAN, volume = "45", number = "9", pages = "117--128", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863561", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Felleisen:2010:TC, author = "Matthias Felleisen", title = "{TeachScheme!}: a checkpoint", journal = j-SIGPLAN, volume = "45", number = "9", pages = "129--130", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863563", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Crary:2010:HOR, author = "Karl Crary", title = "Higher-order representation of substructural logics", journal = j-SIGPLAN, volume = "45", number = "9", pages = "131--142", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863565", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dreyer:2010:IHO, author = "Derek Dreyer and Georg Neis and Lars Birkedal", title = "The impact of higher-order state and control effects on local relational reasoning", journal = j-SIGPLAN, volume = "45", number = "9", pages = "143--156", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863566", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Reed:2010:DMT, author = "Jason Reed and Benjamin C. 
Pierce", title = "Distance makes the types grow stronger: a calculus for differential privacy", journal = j-SIGPLAN, volume = "45", number = "9", pages = "157--168", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863568", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Morgenstern:2010:STP, author = "Jamie Morgenstern and Daniel R. Licata", title = "Security-typed programming within dependently typed programming", journal = j-SIGPLAN, volume = "45", number = "9", pages = "169--180", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863569", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Voigtlander:2010:CSS, author = "Janis Voigtl{\"a}nder and Zhenjiang Hu and Kazutaka Matsuda and Meng Wang", title = "Combining syntactic and semantic bidirectionalization", journal = j-SIGPLAN, volume = "45", number = "9", pages = "181--192", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863571", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Barbosa:2010:MLA, author = "Davi M. J. Barbosa and Julien Cretin and Nate Foster and Michael Greenberg and Benjamin C. 
Pierce", title = "Matching lenses: alignment and view update", journal = j-SIGPLAN, volume = "45", number = "9", pages = "193--204", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863572", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hidaka:2010:BGT, author = "Soichiro Hidaka and Zhenjiang Hu and Kazuhiro Inaba and Hiroyuki Kato and Kazutaka Matsuda and Keisuke Nakano", title = "Bidirectionalizing graph transformations", journal = j-SIGPLAN, volume = "45", number = "9", pages = "205--216", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863573", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pouillard:2010:FLP, author = "Nicolas Pouillard and Fran{\c{c}}ois Pottier", title = "A fresh look at programming with names and binders", journal = j-SIGPLAN, volume = "45", number = "9", pages = "217--228", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863575", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Crestani:2010:ERG, author = "Marcus Crestani and Michael Sperber", title = "Experience report: growing programming languages for beginning students", journal = j-SIGPLAN, volume = "45", number = "9", pages = "229--234", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863576", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Culpepper:2010:FM, author = "Ryan Culpepper and Matthias Felleisen", title = "Fortifying macros", journal = j-SIGPLAN, volume = "45", number = "9", pages = "235--246", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863577", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Blelloch:2010:FPA, author = "Guy E. 
Blelloch", title = "Functional parallel algorithms", journal = j-SIGPLAN, volume = "45", number = "9", pages = "247--248", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863579", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Arnold:2010:SVS, author = "Gilad Arnold and Johannes H{\"o}lzl and Ali Sinan K{\"o}ksal and Rastislav Bod{\'\i}k and Mooly Sagiv", title = "Specifying and verifying sparse matrix codes", journal = j-SIGPLAN, volume = "45", number = "9", pages = "249--260", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863581", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Keller:2010:RSP, author = "Gabriele Keller and Manuel M. T. Chakravarty and Roman Leshchinskiy and Simon Peyton Jones and Ben Lippmeier", title = "Regular, shape-polymorphic, parallel arrays in {Haskell}", journal = j-SIGPLAN, volume = "45", number = "9", pages = "261--272", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863582", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{McCreight:2010:CFC, author = "Andrew McCreight and Tim Chevalier and Andrew Tolmach", title = "A certified framework for compiling and executing garbage-collected languages", journal = j-SIGPLAN, volume = "45", number = "9", pages = "273--284", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863584", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Danielsson:2010:TPC, author = "Nils Anders Danielsson", title = "Total parser combinators", journal = j-SIGPLAN, volume = "45", number = "9", pages = "285--296", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863585", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Brady:2010:SYI, author = "Edwin C. 
Brady and Kevin Hammond", title = "Scrapping your inefficient engine: using partial evaluation to improve domain-specific language implementation", journal = j-SIGPLAN, volume = "45", number = "9", pages = "297--308", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863587", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mitchell:2010:RS, author = "Neil Mitchell", title = "Rethinking supercompilation", journal = j-SIGPLAN, volume = "45", number = "9", pages = "309--320", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863588", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chargueraud:2010:PVT, author = "Arthur Chargu{\'e}raud", title = "Program verification through characteristic formulae", journal = j-SIGPLAN, volume = "45", number = "9", pages = "321--332", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863590", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Stampoulis:2010:VTC, author = "Antonis Stampoulis and Zhong Shao", title = "{VeriML}: typed computation of logical terms inside a language with effects", journal = j-SIGPLAN, volume = "45", number = "9", pages = "333--344", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863591", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bernardy:2010:PDT, author = "Jean-Philippe Bernardy and Patrik Jansson and Ross Paterson", title = "Parametricity and dependent types", journal = j-SIGPLAN, volume = "45", number = "9", pages = "345--356", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863592", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fischer:2010:PRE, author = "Sebastian Fischer and Frank Huch and Thomas Wilke", title = "A play on regular expressions: functional pearl", journal = j-SIGPLAN, volume = "45", number = "9", pages = "357--368", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863594", ISSN = "0362-1340 (print), 1523-2867 
(print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pop:2010:ERH, author = "Iustin Pop", title = "Experience report: {Haskell} as a reagent: results and observations on the use of {Haskell} in a {Python} project", journal = j-SIGPLAN, volume = "45", number = "9", pages = "369--374", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863595", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Morris:2010:ICT, author = "J. Garrett Morris and Mark P. Jones", title = "Instance chains: type class programming without overlapping instances", journal = j-SIGPLAN, volume = "45", number = "9", pages = "375--386", month = sep, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932681.1863596", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:43 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Forrest:2010:CES, author = "Stephanie Forrest", title = "The case for evolvable software", journal = j-SIGPLAN, volume = "45", number = "10", pages = "1--1", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869539", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pierce:2010:ASF, author = "Benjamin C. Pierce", title = "Art, science, and fear", journal = j-SIGPLAN, volume = "45", number = "10", pages = "2--2", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869540", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Syme:2010:FTS, author = "Don Syme", title = "{F\#}: Taking Succinct, Efficient, Typed Functional Programming into the Mainstream", journal = j-SIGPLAN, volume = "45", number = "10", pages = "3--3", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1921682", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Stanley:2010:AOH, author = "Kenneth O. 
Stanley", title = "To achieve our highest goals, we must be willing to abandon them", journal = j-SIGPLAN, volume = "45", number = "10", pages = "3--3", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869541", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Roberson:2010:EMG, author = "Michael Roberson and Chandrasekhar Boyapati", title = "Efficient modular glass box software model checking", journal = j-SIGPLAN, volume = "45", number = "10", pages = "4--21", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869461", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hanenberg:2010:EAS, author = "Stefan Hanenberg", title = "An experiment about static and dynamic type systems: doubts about the positive impact of static type systems on development time", journal = j-SIGPLAN, volume = "45", number = "10", pages = "22--35", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869462", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Itzhaky:2010:SIS, author = "Shachar Itzhaky and Sumit Gulwani and Neil Immerman and Mooly Sagiv", title = "A simple inductive synthesis methodology and its applications", journal = j-SIGPLAN, volume = "45", number = "10", pages = "36--46", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869463", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mercadal:2010:DSA, author = "Julien Mercadal and Quentin Enard and Charles Consel and Nicolas Loriant", title = "A domain-specific approach to architecturing error handling in pervasive computing", journal = j-SIGPLAN, volume = "45", number = "10", pages = "47--61", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869465", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2010:GFR, author = "Wei Li and Charles Zhang and Songlin Hu", title = "{G-Finder}: routing programming questions closer to the experts", journal = j-SIGPLAN, volume = "45", number = "10", pages = "62--73", 
month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869466", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hoda:2010:AC, author = "Rashina Hoda and Philippe Kruchten and James Noble and Stuart Marshall", title = "Agility in context", journal = j-SIGPLAN, volume = "45", number = "10", pages = "74--88", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869467", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Auerbach:2010:LJC, author = "Joshua Auerbach and David F. Bacon and Perry Cheng and Rodric Rabbah", title = "{Lime}: a {Java}-compatible and synthesizable language for heterogeneous architectures", journal = j-SIGPLAN, volume = "45", number = "10", pages = "89--108", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869469", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kou:2010:OFF, author = "Stephen Kou and Jens Palsberg", title = "From {OO} to {FPGA}: fitting round objects into square hardware?", journal = j-SIGPLAN, volume = "45", number = "10", pages = "109--124", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869470", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tian:2010:ICP, author = "Kai Tian and Yunlian Jiang and Eddy Z. Zhang and Xipeng Shen", title = "An input-centric paradigm for program dynamic optimizations", journal = j-SIGPLAN, volume = "45", number = "10", pages = "125--139", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869471", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wood:2010:CSS, author = "Benjamin P. 
Wood and Adrian Sampson and Luis Ceze and Dan Grossman", title = "Composable specifications for structured shared-memory communication", journal = j-SIGPLAN, volume = "45", number = "10", pages = "140--159", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869473", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shi:2010:DUW, author = "Yao Shi and Soyeon Park and Zuoning Yin and Shan Lu and Yuanyuan Zhou and Wenguang Chen and Weimin Zheng", title = "Do {I} use the wrong definition?: {DeFuse}: definition-use invariants for detecting concurrency and sequential bugs", journal = j-SIGPLAN, volume = "45", number = "10", pages = "160--174", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869474", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gabel:2010:SSD, author = "Mark Gabel and Junfeng Yang and Yuan Yu and Moises Goldszmidt and Zhendong Su", title = "Scalable and systematic detection of buggy inconsistencies in source code", journal = j-SIGPLAN, volume = "45", number = "10", pages = "175--190", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869475", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ogata:2010:SJN, author = "Kazunori Ogata and Dai Mikurube and Kiyokuni Kawachiya and Scott Trent and Tamiya Onodera", title = "A study of {Java}'s non-{Java} memory", journal = j-SIGPLAN, volume = "45", number = "10", pages = "191--204", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869477", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{McIlroy:2010:HJR, author = "Ross McIlroy and Joe Sventek", title = "{Hera-JVM}: a runtime system for heterogeneous multi-core architectures", journal = j-SIGPLAN, volume = "45", number = "10", pages = "205--222", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869478", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wegiel:2010:CLT, author = "Michal Wegiel and Chandra Krintz", title = 
"Cross-language, type-safe, and transparent object sharing for co-located managed runtimes", journal = j-SIGPLAN, volume = "45", number = "10", pages = "223--240", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869479", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jin:2010:ISS, author = "Guoliang Jin and Aditya Thakur and Ben Liblit and Shan Lu", title = "Instrumentation and sampling strategies for cooperative concurrency bug isolation", journal = j-SIGPLAN, volume = "45", number = "10", pages = "241--255", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869481", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Reichenbach:2010:WCG, author = "Christoph Reichenbach and Neil Immerman and Yannis Smaragdakis and Edward E. Aftandilian and Samuel Z. Guyer", title = "What can the {GC} compute efficiently?: a language for heap assertions at {GC} time", journal = j-SIGPLAN, volume = "45", number = "10", pages = "256--269", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869482", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Purandare:2010:MOS, author = "Rahul Purandare and Matthew B. Dwyer and Sebastian Elbaum", title = "Monitor optimization via stutter-equivalent loop transformation", journal = j-SIGPLAN, volume = "45", number = "10", pages = "270--285", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869483", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Schaefer:2010:SIR, author = "Max Schaefer and Oege de Moor", title = "Specifying and implementing refactorings", journal = j-SIGPLAN, volume = "45", number = "10", pages = "286--301", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869485", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nguyen:2010:GBA, author = "Hoan Anh Nguyen and Tung Thanh Nguyen and Gary {Wilson, Jr.} and Anh Tuan Nguyen and Miryung Kim and Tien N. 
Nguyen", title = "A graph-based approach to {API} usage adaptation", journal = j-SIGPLAN, volume = "45", number = "10", pages = "302--321", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869486", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kell:2010:CAA, author = "Stephen Kell", title = "Component adaptation and assembly using interface relations", journal = j-SIGPLAN, volume = "45", number = "10", pages = "322--340", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869487", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Oliveira:2010:TCO, author = "Bruno C. d. S. Oliveira and Adriaan Moors and Martin Odersky", title = "Type classes as objects and implicits", journal = j-SIGPLAN, volume = "45", number = "10", pages = "341--360", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869489", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lerner:2010:SDT, author = "Benjamin S. 
Lerner and Herman Venter and Dan Grossman", title = "Supporting dynamic, third-party code customizations in {JavaScript} using aspects", journal = j-SIGPLAN, volume = "45", number = "10", pages = "361--376", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869490", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Herzeel:2010:DPR, author = "Charlotte Herzeel and Pascal Costanza", title = "Dynamic parallelization of recursive code: part 1: managing control flow interactions with the continuator", journal = j-SIGPLAN, volume = "45", number = "10", pages = "377--396", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869491", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dillig:2010:SHA, author = "Isil Dillig and Thomas Dillig and Alex Aiken", title = "Symbolic heap abstraction with demand-driven axiomatization of memory invariants", journal = j-SIGPLAN, volume = "45", number = "10", pages = "397--410", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869493", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liang:2010:DEP, author = "Percy Liang and Omer Tripp and Mayur Naik and Mooly Sagiv", title = "A dynamic evaluation of the precision of static heap abstractions", journal = j-SIGPLAN, volume = "45", number = "10", pages = "411--427", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869494", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mendez-Lojo:2010:PIB, author = "Mario M{\'e}ndez-Lojo and Augustine Mathew and Keshav Pingali", title = "Parallel inclusion-based points-to analysis", journal = j-SIGPLAN, volume = "45", number = "10", pages = "428--443", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869495", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kats:2010:SLW, author = "Lennart C. L. 
Kats and Eelco Visser", title = "The {Spoofax} language workbench: rules for declarative specification of languages and {IDEs}", journal = j-SIGPLAN, volume = "45", number = "10", pages = "444--463", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869497", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Servetto:2010:MMC, author = "Marco Servetto and Elena Zucca", title = "{MetaFJig}: a meta-circular composition language for {Java}-like classes", journal = j-SIGPLAN, volume = "45", number = "10", pages = "464--483", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869498", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Klose:2010:MLM, author = "Karl Klose and Klaus Ostermann", title = "Modular logic metaprogramming", journal = j-SIGPLAN, volume = "45", number = "10", pages = "484--503", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869499", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{vanStaden:2010:RAM, author = "Stephan van Staden and Cristiano Calcagno", title = "Reasoning about multiple related abstractions with {MultiStar}", journal = j-SIGPLAN, volume = "45", number = "10", pages = "504--519", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869501", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Qi:2010:HFS, author = "Xin Qi and Andrew C. 
Myers", title = "Homogeneous family sharing", journal = j-SIGPLAN, volume = "45", number = "10", pages = "520--538", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869502", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chiba:2010:MMC, author = "Shigeru Chiba and Atsushi Igarashi and Salikh Zakirov", title = "Mostly modular compilation of crosscutting concerns by contextual predicate dispatch", journal = j-SIGPLAN, volume = "45", number = "10", pages = "539--554", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869503", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Klein:2010:RTH, author = "Casey Klein and Matthew Flatt and Robert Bruce Findler", title = "Random testing for higher-order, stateful programs", journal = j-SIGPLAN, volume = "45", number = "10", pages = "555--566", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869505", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{McCarthy:2010:TSS, author = "Jay A. McCarthy", title = "The two-state solution: native and serializable continuations accord", journal = j-SIGPLAN, volume = "45", number = "10", pages = "567--582", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869506", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Swaine:2010:BFI, author = "James Swaine and Kevin Tew and Peter Dinda and Robert Bruce Findler and Matthew Flatt", title = "Back to the futures: incremental parallelization of existing sequential runtime systems", journal = j-SIGPLAN, volume = "45", number = "10", pages = "583--597", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869507", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zibin:2010:OIG, author = "Yoav Zibin and Alex Potanin and Paley Li and Mahmood Ali and Michael D. 
Ernst", title = "Ownership and immutability in generic {Java}", journal = j-SIGPLAN, volume = "45", number = "10", pages = "598--617", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869509", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cameron:2010:TO, author = "Nicholas Cameron and James Noble and Tobias Wrigstad", title = "Tribal ownership", journal = j-SIGPLAN, volume = "45", number = "10", pages = "618--633", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Matsakis:2010:TAT, author = "Nicholas D. Matsakis and Thomas R. Gross", title = "A time-aware type system for data-race protection and guaranteed initialization", journal = j-SIGPLAN, volume = "45", number = "10", pages = "634--651", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869511", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Upadhyaya:2010:AAR, author = "Gautam Upadhyaya and Samuel P. Midkiff and Vijay S. Pai", title = "Automatic atomic region identification in shared memory {SPMD} programs", journal = j-SIGPLAN, volume = "45", number = "10", pages = "652--670", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869513", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kulkarni:2010:TTP, author = "Aditya Kulkarni and Yu David Liu and Scott F. 
Smith", title = "Task types for pervasive atomicity", journal = j-SIGPLAN, volume = "45", number = "10", pages = "671--690", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869514", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Burckhardt:2010:CPR, author = "Sebastian Burckhardt and Alexandro Baldassin and Daan Leijen", title = "Concurrent programming with revisions and isolation types", journal = j-SIGPLAN, volume = "45", number = "10", pages = "691--707", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869515", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bebenita:2010:STB, author = "Michael Bebenita and Florian Brandner and Manuel Fahndrich and Francesco Logozzo and Wolfram Schulte and Nikolai Tillmann and Herman Venter", title = "{SPUR}: a trace-based {JIT} compiler for {CIL}", journal = j-SIGPLAN, volume = "45", number = "10", pages = "708--725", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869517", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kapur:2010:RRL, author = "Puneet Kapur and Brad Cossette and Robert J. 
Walker", title = "Refactoring references for library migration", journal = j-SIGPLAN, volume = "45", number = "10", pages = "726--738", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Altman:2010:PAI, author = "Erik Altman and Matthew Arnold and Stephen Fink and Nick Mitchell", title = "Performance analysis of idle programs", journal = j-SIGPLAN, volume = "45", number = "10", pages = "739--753", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869519", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Davis:2010:RBL, author = "Samuel Davis and Gregor Kiczales", title = "Registration-based language abstractions", journal = j-SIGPLAN, volume = "45", number = "10", pages = "754--773", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869521", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Verwaest:2010:PBR, author = "Toon Verwaest and Camillo Bruni and David Gurtner and Adrian Lienhard and Oscar Niestrasz", title = "{Pinocchio}: bringing reflection to life with first-class interpreters", journal = j-SIGPLAN, volume = "45", number = "10", pages = "774--789", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Rajan:2010:CMD, author = "Hridesh Rajan and Steven M. 
Kautz and Wayne Rowcliffe", title = "Concurrency by modularity: design patterns, a case in point", journal = j-SIGPLAN, volume = "45", number = "10", pages = "790--805", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869523", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Rinard:2010:PSA, author = "Martin Rinard and Henry Hoffmann and Sasa Misailovic and Stelios Sidiroglou", title = "Patterns and statistical analysis for understanding reduced resource computing", journal = j-SIGPLAN, volume = "45", number = "10", pages = "806--821", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869525", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sorensen:2010:PTC, author = "Andrew Sorensen and Henry Gardner", title = "Programming with time: cyber-physical programming with {impromptu}", journal = j-SIGPLAN, volume = "45", number = "10", pages = "822--834", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869526", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chafi:2010:LVH, author = "Hassan Chafi and Zach DeVito and Adriaan Moors and Tiark Rompf and Arvind K. 
Sujeeth and Pat Hanrahan and Martin Odersky and Kunle Olukotun", title = "Language virtualization for heterogeneous parallel computing", journal = j-SIGPLAN, volume = "45", number = "10", pages = "835--847", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869527", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ossher:2010:FMT, author = "Harold Ossher and Rachel Bellamy and Ian Simmonds and David Amid and Ateret Anaby-Tavor and Matthew Callery and Michael Desmond and Jacqueline de Vries and Amit Fisher and Sophia Krasikov", title = "Flexible modeling tools for pre-requirements analysis: conceptual architecture and research challenges", journal = j-SIGPLAN, volume = "45", number = "10", pages = "848--864", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dumitras:2010:UUI, author = "Tudor Dumitras and Priya Narasimhan and Eli Tilevich", title = "To upgrade or not to upgrade: impact of online upgrades across multiple administrative domains", journal = j-SIGPLAN, volume = "45", number = "10", pages = "865--876", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869530", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Arnold:2010:MAP, author = "Kenneth C. Arnold and Henry Lieberman", title = "Managing ambiguity in programming by finding unambiguous examples", journal = j-SIGPLAN, volume = "45", number = "10", pages = "877--884", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869531", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gabriel:2010:BST, author = "Richard P. Gabriel and Kevin J. 
Sullivan", title = "Better science through art", journal = j-SIGPLAN, volume = "45", number = "10", pages = "885--900", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869533", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Quillien:2010:RDN, author = "Jenny Quillien and Dave West", title = "Rubber ducks, nightmares, and unsaturated predicates: proto-scientific schemata are good for agile", journal = j-SIGPLAN, volume = "45", number = "10", pages = "901--917", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kats:2010:PDS, author = "Lennart C. L. Kats and Eelco Visser and Guido Wachsmuth", title = "Pure and declarative syntax definition: paradise lost and regained", journal = j-SIGPLAN, volume = "45", number = "10", pages = "918--932", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869535", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hanenberg:2010:FHL, author = "Stefan Hanenberg", title = "Faith, hope, and love: an essay on software science's neglect of human factors", journal = j-SIGPLAN, volume = "45", number = "10", pages = "933--946", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869536", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Adamczyk:2010:TBD, author = "Paul Adamczyk and Munawar Hafiz", title = "The {Tower of Babel} did not fail", journal = j-SIGPLAN, volume = "45", number = "10", pages = "947--957", month = oct, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1932682.1869537", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:13:46 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Rendel:2010:ISD, author = "Tillmann Rendel and Klaus Ostermann", title = "Invertible syntax descriptions: unifying parsing and pretty printing", journal = j-SIGPLAN, volume = "45", number = "11", pages = "1--12", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863525", ISSN = "0362-1340 (print), 1523-2867 
(print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Straka:2010:PHC, author = "Milan Straka", title = "The performance of the {Haskell} containers package", journal = j-SIGPLAN, volume = "45", number = "11", pages = "13--24", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863526", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Pirog:2010:SDS, author = "Maciej Pirog and Dariusz Biernacki", title = "A systematic derivation of the {STG} machine verified in {Coq}", journal = j-SIGPLAN, volume = "45", number = "11", pages = "25--36", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863528", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Magalhaes:2010:GDM, author = "Jos{\'e} Pedro Magalh{\~a}es and Atze Dijkstra and Johan Jeuring and Andres L{\"o}h", title = "A generic deriving mechanism for {Haskell}", journal = j-SIGPLAN, volume = "45", number = "11", pages = "37--48", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{vanGroningen:2010:ESB, author = "John van Groningen and Thomas van Noort and Peter Achten and Pieter Koopman and Rinus Plasmeijer", title = "Exchanging sources between {Clean} and {Haskell}: a double-edged front end for the {Clean} compiler", journal = j-SIGPLAN, volume = "45", number = "11", pages = "49--60", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863530", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The functional programming languages Clean and Haskell have been around for over two decades. Over time, both languages have developed a large body of useful libraries and come with interesting language features. It is our primary goal to benefit from each other's evolutionary results by facilitating the exchange of sources between Clean and Haskell and study the forthcoming interactions between their distinct languages features. 
This is achieved by using the existing Clean compiler as starting point, and implementing a double-edged front end for this compiler: it supports both standard Clean 2.1 and (currently a large part of) standard Haskell 98. Moreover, it allows both languages to seamlessly use many of each other's language features that were alien to each other before. For instance, Haskell can now use uniqueness typing anywhere, and Clean can use newtypes efficiently. This has given birth to two new dialects of Clean and Haskell, dubbed Clean* and Haskell*. Additionally, measurements of the performance of the new compiler indicate that it is on par with the flagship Haskell compiler GHC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Morris:2010:ERU, author = "J. Garrett Morris", title = "Experience report: using hackage to inform language design", journal = j-SIGPLAN, volume = "45", number = "11", pages = "61--66", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863531", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Mainland:2010:NEC, author = "Geoffrey Mainland and Greg Morrisett", title = "{Nikola}: embedding compiled {GPU} functions in {Haskell}", journal = j-SIGPLAN, volume = "45", number = "11", pages = "67--78", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863533", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Launchbury:2010:COH, author = "John Launchbury and Trevor Elliott", title = "Concurrent orchestration in {Haskell}", journal = j-SIGPLAN, volume = "45", number = "11", pages = "79--90", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Marlow:2010:SNM, author = "Simon Marlow and Patrick Maier and Hans-Wolfgang Loidl and Mustafa K. 
Aswad and Phil Trinder", title = "Seq no more: better strategies for parallel {Haskell}", journal = j-SIGPLAN, volume = "45", number = "11", pages = "91--102", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863535", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{OSullivan:2010:SEH, author = "Bryan O'Sullivan and Johan Tibell", title = "Scalable {I/O} event handling for {GHC}", journal = j-SIGPLAN, volume = "45", number = "11", pages = "103--108", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863536", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Terei:2010:LBG, author = "David A. Terei and Manuel M. T. Chakravarty", title = "An {{\tt llvm}} backend for {GHC}", journal = j-SIGPLAN, volume = "45", number = "11", pages = "109--120", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863538", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Ramsey:2010:HMR, author = "Norman Ramsey and Jo{\~a}o Dias and Simon Peyton Jones", title = "{Hoopl}: a modular, reusable library for dataflow analysis and transformation", journal = j-SIGPLAN, volume = "45", number = "11", pages = "121--134", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863539", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Bolingbroke:2010:SE, author = "Maximilian Bolingbroke and Simon Peyton Jones", title = "Supercompilation by evaluation", journal = j-SIGPLAN, volume = "45", number = "11", pages = "135--146", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863540", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Yorgey:2010:SFT, author = "Brent A. 
Yorgey", title = "Species and functors and types, oh my!", journal = j-SIGPLAN, volume = "45", number = "11", pages = "147--158", month = nov, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2088456.1863542", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:45 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "HASKELL '10 conference proceedings.", } @Article{Brunthaler:2010:EIU, author = "Stefan Brunthaler", title = "Efficient interpretation using quickening", journal = j-SIGPLAN, volume = "45", number = "12", pages = "1--14", month = dec, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1899661.1869633", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Dec 15 10:25:15 MST 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Just-in-time compilers offer the biggest achievable payoff performance-wise, but their implementation is a non-trivial, time-consuming task affecting the interpreter's maintenance for years to come, too. Recent research addresses this issue by providing ways of leveraging existing just-in-time compilation infrastructures. Though there has been considerable research on improving the efficiency of just-in-time compilers, the area of optimizing interpreters has gotten less attention as if the implementation of a dynamic translation system was the ``ultima ratio'' for efficiently interpreting programming languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zakirov:2010:ODD, author = "Salikh S. Zakirov and Shigeru Chiba and Etsuya Shibayama", title = "Optimizing dynamic dispatch with fine-grained state tracking", journal = j-SIGPLAN, volume = "45", number = "12", pages = "15--26", month = dec, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1899661.1869634", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Dec 15 10:25:15 MST 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic mixin is a construct available in Ruby and other dynamic languages. It can be used as a base to implement a range of programming paradigms, such as dynamic aspect-oriented programming and context-oriented programming. However, the performance characteristics of current implementation of dynamic mixin in Ruby leaves much to be desired under condition of frequent dynamic mixin operations, global method cache and inline cache misses incur significant overhead. In this work we implemented fine-grained state tracking for CRuby 1. and were able to improve performance by more than six times on the microbenchmark exercising extreme case flowing 4 times to global method cache clearing, 28\% to fine-grained state tracking and further 12\% to inline cache miss elimination by caching alternating states.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gorbovitski:2010:AAO, author = "Michael Gorbovitski and Yanhong A. Liu and Scott D. Stoller and Tom Rothamel and Tuncay K. 
Tekle", title = "Alias analysis for optimization of dynamic languages", journal = j-SIGPLAN, volume = "45", number = "12", pages = "27--42", month = dec, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1899661.1869635", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Dec 15 10:25:15 MST 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic languages such as Python allow programs to be written more easily using high-level constructs such as comprehensions for queries and using generic code. Efficient execution of programs then requires powerful optimizations - incrementalization of expensive queries and specialization of generic code. Effective incrementalization and specialization of dynamic languages require precise and scalable alias analysis. This paper describes the development and experimental evaluation of a may-alias analysis for a full dynamic object-oriented language, for program optimization by incrementalization and specialization. The analysis is flow-sensitive; we show that this is necessary for effective optimization of dynamic languages. It uses precise type analysis and a powerful form of context sensitivity, called trace sensitivity, to further improve analysis precision.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pestov:2010:FDS, author = "Sviatoslav Pestov and Daniel Ehrenberg and Joe Groff", title = "{Factor}: a dynamic stack-based programming language", journal = j-SIGPLAN, volume = "45", number = "12", pages = "43--58", month = dec, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1899661.1869637", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Dec 15 10:25:15 MST 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Factor is a new dynamic object-oriented programming language. It began as an embedded scripting language and evolved to a mature application development language. The language has a simple execution model and is based on the manipulation of data on a stack. An advanced metaprogramming system provides means for easily extending the language. Thus, Factor allows programmers to use the right features for their problem domain. The Factor implementation is self-hosting, featuring an interactive development environment and an optimizing compiler. In this paper, the language and its implementation are presented.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{VanCutsem:2010:PDP, author = "Tom {Van Cutsem} and Mark S. Miller", title = "Proxies: design principles for robust object-oriented intercession {APIs}", journal = j-SIGPLAN, volume = "45", number = "12", pages = "59--72", month = dec, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1899661.1869638", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Dec 15 10:25:15 MST 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Proxies are a powerful approach to implement meta-objects in object-oriented languages without having to resort to metacircular interpretation. We introduce such a meta-level API based on proxies for Javascript. 
We simultaneously introduce a set of design principles that characterize such APIs in general, and compare similar APIs of other languages in terms of these principles. We highlight how principled proxy-based APIs improve code robustness by avoiding interference between base and meta-level code that occur in more common reflective intercession mechanisms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tratt:2010:EIL, author = "Laurence Tratt", title = "Experiences with an {Icon}-like expression evaluation system", journal = j-SIGPLAN, volume = "45", number = "12", pages = "73--80", month = dec, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1899661.1869640", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Dec 15 10:25:15 MST 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The design of the Icon programming language's expression evaluation system, which can perform limited backtracking, was unique amongst imperative programming languages when created. In this paper I explain and critique the original Icon design and show how a similar system can be integrated into a modern dynamically typed language. Finally I detail my experiences of this system and offer suggestions for the lessons to be learned from it.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Axelsen:2010:CDM, author = "Eyvind W. Axelsen and Stein Krogdahl and Birger M{\o}ller-Pedersen", title = "Controlling dynamic module composition through an extensible meta-level {API}", journal = j-SIGPLAN, volume = "45", number = "12", pages = "81--96", month = dec, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1899661.1869641", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Dec 15 10:25:15 MST 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In addition to traditional object-oriented (OO) concepts such as inheritance and polymorphism, several modularization and composition mechanisms like e.g. traits, mixins and virtual classes have emerged. The Package Template mechanism is another attempt at providing a flexible mechanism for modularization, composition and adaption. Dynamic languages have traditionally employed strong support for meta-programming, with hooks to control OO concepts such as method invocation and object construction, by utilizing meta-classes and meta-object protocols. In this work, we attempt to bring a corresponding degree of meta-level control to composition primitives, with a concrete starting point in the package template mechanism as developed for the dynamic language Groovy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Strickland:2010:CFC, author = "T. 
Stephen Strickland and Matthias Felleisen", title = "Contracts for first-class classes", journal = j-SIGPLAN, volume = "45", number = "12", pages = "97--112", month = dec, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1899661.1869642", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Dec 15 10:25:15 MST 2010", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "First-class classes add expressive power to class-based object-oriented languages. Most importantly, programmers can abstract over common scenarios with first-class classes. When it comes to behavioral software contracts, however, first-class classes pose significant challenges. In this paper, we present the first contract system for a programming language with first-class classes. The design has been implemented for Racket, which supports first-class classes and which implements mixins and traits as syntactic sugar. We expect that our experience also applies to languages with native mixins and/or traits.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Leroy:2011:VSD, author = "Xavier Leroy", title = "Verified squared: does critical software deserve verified tools?", journal = j-SIGPLAN, volume = "46", number = "1", pages = "1--2", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926387", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lhotak:2011:PAE, author = "Ondrej Lhot{\'a}k and Kwok-Chiang Andrew Chung", title = "Points-to analysis with efficient strong updates", journal = j-SIGPLAN, volume = "46", number = "1", pages = "3--16", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926389", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Smaragdakis:2011:PYC, author = "Yannis Smaragdakis and Martin Bravenboer and Ondrej Lhot{\'a}k", title = "Pick your contexts well: understanding object-sensitivity", journal = j-SIGPLAN, volume = "46", number = "1", pages = "17--30", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926390", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liang:2011:LMA, author = "Percy Liang and Omer Tripp and Mayur Naik", title = "Learning minimal abstractions", journal = j-SIGPLAN, volume = "46", number = "1", pages = "31--42", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926391", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", 
bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sevcik:2011:RMC, author = "Jaroslav {\v{S}}ev{\c{c}}ik and Viktor Vafeiadis and Francesco Zappa Nardelli and Suresh Jagannathan and Peter Sewell", title = "Relaxed-memory concurrency and verified compilation", journal = j-SIGPLAN, volume = "46", number = "1", pages = "43--54", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926393", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Batty:2011:MCC, author = "Mark Batty and Scott Owens and Susmit Sarkar and Peter Sewell and Tjark Weber", title = "Mathematizing {C++} concurrency", journal = j-SIGPLAN, volume = "46", number = "1", pages = "55--66", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926394", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ramananandro:2011:FVO, author = "Tahina Ramananandro and Gabriel {Dos Reis} and Xavier Leroy", title = "Formal verification of object layout for {C++} multiple inheritance", journal = j-SIGPLAN, volume = "46", number = "1", pages = "67--80", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926395", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Choi:2011:SAM, author = "Wontae Choi and Baris Aktemur and Kwangkeun Yi and Makoto Tatsuta", title = "Static analysis of multi-staged programs via unstaging translation", journal = j-SIGPLAN, volume = "46", number = "1", pages = "81--92", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926397", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Schwarz:2011:SAI, author = "Martin D. 
Schwarz and Helmut Seidl and Vesal Vojdani and Peter Lammich and Markus M{\"u}ller-Olm", title = "Static analysis of interrupt-driven programs synchronized via the priority ceiling protocol", journal = j-SIGPLAN, volume = "46", number = "1", pages = "93--104", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926398", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cousot:2011:PSF, author = "Patrick Cousot and Radhia Cousot and Francesco Logozzo", title = "A parametric segmentation functor for fully automatic and scalable array content analysis", journal = j-SIGPLAN, volume = "46", number = "1", pages = "105--118", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926399", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Birkedal:2011:SIK, author = "Lars Birkedal and Bernhard Reus and Jan Schwinghammer and Kristian St{\o}vring and Jacob Thamsborg and Hongseok Yang", title = "Step-indexed {Kripke} models over recursive worlds", journal = j-SIGPLAN, volume = "46", number = "1", pages = "119--132", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926401", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hur:2011:KLR, author = "Chung-Kil Hur and Derek Dreyer", title = "A {Kripke} logical relation between {ML} and assembly", journal = j-SIGPLAN, volume = "46", number = "1", pages = "133--146", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926402", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pottier:2011:TSP, author = "Fran{\c{c}}ois Pottier", title = "A typed store-passing translation for general references", journal = j-SIGPLAN, volume = "46", number = "1", pages = "147--158", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926403", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Prountzos:2011:SAO, author = "Dimitrios Prountzos and Roman Manevich and Keshav Pingali and Kathryn S. 
McKinley", title = "A shape analysis for optimizing parallel graph programs", journal = j-SIGPLAN, volume = "46", number = "1", pages = "159--172", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926405", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Rival:2011:CCA, author = "Xavier Rival and Bor-Yuh Evan Chang", title = "Calling context abstraction with shapes", journal = j-SIGPLAN, volume = "46", number = "1", pages = "173--186", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926406", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dillig:2011:PRP, author = "Isil Dillig and Thomas Dillig and Alex Aiken", title = "Precise reasoning for programs using containers", journal = j-SIGPLAN, volume = "46", number = "1", pages = "187--200", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926407", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ahmed:2011:BA, author = "Amal Ahmed and Robert Bruce Findler and Jeremy G. 
Siek and Philip Wadler", title = "Blame for all", journal = j-SIGPLAN, volume = "46", number = "1", pages = "201--214", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926409", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dimoulas:2011:CBC, author = "Christos Dimoulas and Robert Bruce Findler and Cormac Flanagan and Matthias Felleisen", title = "Correct blame for contracts: no more scapegoating", journal = j-SIGPLAN, volume = "46", number = "1", pages = "215--226", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926410", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Weirich:2011:GTA, author = "Stephanie Weirich and Dimitrios Vytiniotis and Simon Peyton Jones and Steve Zdancewic", title = "Generative type abstraction and type-level computation", journal = j-SIGPLAN, volume = "46", number = "1", pages = "227--240", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926411", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{MacLaurin:2011:DKT, author = "Matthew B. MacLaurin", title = "The design of {Kodu}: a tiny visual programming language for children on the {Xbox 360}", journal = j-SIGPLAN, volume = "46", number = "1", pages = "241--246", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926413", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Turon:2011:SLR, author = "Aaron Joseph Turon and Mitchell Wand", title = "A separation logic for refining concurrent objects", journal = j-SIGPLAN, volume = "46", number = "1", pages = "247--258", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926415", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dodds:2011:MRD, author = "Mike Dodds and Suresh Jagannathan and Matthew J. 
Parkinson", title = "Modular reasoning for deterministic parallelism", journal = j-SIGPLAN, volume = "46", number = "1", pages = "259--270", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926416", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jacobs:2011:EMF, author = "Bart Jacobs and Frank Piessens", title = "Expressive modular fine-grained concurrency specification", journal = j-SIGPLAN, volume = "46", number = "1", pages = "271--282", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926417", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Madhusudan:2011:TWA, author = "P. Madhusudan and Gennaro Parlato", title = "The tree width of auxiliary storage", journal = j-SIGPLAN, volume = "46", number = "1", pages = "283--294", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926419", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tzevelekos:2011:FRA, author = "Nikos Tzevelekos", title = "Fresh-register automata", journal = j-SIGPLAN, volume = "46", number = "1", pages = "295--306", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926420", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Leroux:2011:VAS, author = "J{\'e}r{\^o}me Leroux", title = "Vector addition system reachability problem: a short self-contained proof", journal = j-SIGPLAN, volume = "46", number = "1", pages = "307--316", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926421", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gulwani:2011:ASP, author = "Sumit Gulwani", title = "Automating string processing in spreadsheets using input-output examples", journal = j-SIGPLAN, volume = "46", number = "1", pages = "317--330", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926423", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = 
"http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gupta:2011:PAR, author = "Ashutosh Gupta and Corneliu Popeea and Andrey Rybalchenko", title = "Predicate abstraction and refinement for verifying multi-threaded programs", journal = j-SIGPLAN, volume = "46", number = "1", pages = "331--344", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926424", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ghica:2011:GSIa, author = "Dan R. Ghica and Alex Smith", title = "Geometry of synthesis {III}: resource management through type inference", journal = j-SIGPLAN, volume = "46", number = "1", pages = "345--356", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926425", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hoffmann:2011:MAR, author = "Jan Hoffmann and Klaus Aehlig and Martin Hofmann", title = "Multivariate amortized resource analysis", journal = j-SIGPLAN, volume = "46", number = "1", pages = "357--370", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926427", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hofmann:2011:SL, author = "Martin Hofmann and Benjamin Pierce and Daniel Wagner", title = "Symmetric lenses", journal = j-SIGPLAN, volume = "46", number = "1", pages = "371--384", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926428", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Henglein:2011:REC, author = "Fritz Henglein and Lasse Nielsen", title = "Regular expression containment: coinductive axiomatization and computational interpretation", journal = j-SIGPLAN, volume = "46", number = "1", pages = "385--398", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926429", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cook:2011:MPD, author = "Byron Cook and Eric Koskinen", title 
= "Making prophecies with decision predicates", journal = j-SIGPLAN, volume = "46", number = "1", pages = "399--410", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926431", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Emmi:2011:DBS, author = "Michael Emmi and Shaz Qadeer and Zvonimir Rakamari{\'c}", title = "Delay-bounded scheduling", journal = j-SIGPLAN, volume = "46", number = "1", pages = "411--422", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926432", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sinha:2011:IA, author = "Nishant Sinha and Chao Wang", title = "On interference abstractions", journal = j-SIGPLAN, volume = "46", number = "1", pages = "423--434", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926433", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Denielou:2011:DMS, author = "Pierre-Malo Deni{\'e}lou and Nobuko Yoshida", title = "Dynamic multirole session types", journal = j-SIGPLAN, volume = "46", number = "1", pages = "435--446", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926435", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tov:2011:PAT, author = "Jesse A. Tov and Riccardo Pucella", title = "Practical affine types", journal = j-SIGPLAN, volume = "46", number = "1", pages = "447--458", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926436", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{An:2011:DIS, author = "Jong-hoon (David) An and Avik Chaudhuri and Jeffrey S. 
Foster and Michael Hicks", title = "Dynamic inference of static types for {\tt ruby}", journal = j-SIGPLAN, volume = "46", number = "1", pages = "459--472", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926437", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gordon:2011:RMV, author = "Andrew D. Gordon and Robert Harper and John Harrison and Alan Jeffrey and Peter Sewell", title = "{Robin Milner 1934--2010}: verification, languages, and concurrency", journal = j-SIGPLAN, volume = "46", number = "1", pages = "473--474", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926439", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bendersky:2011:SOB, author = "Anna Bendersky and Erez Petrank", title = "Space overhead bounds for dynamic memory management with partial compaction", journal = j-SIGPLAN, volume = "46", number = "1", pages = "475--486", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926441", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Attiya:2011:LOE, author = "Hagit Attiya and Rachid Guerraoui and Danny Hendler and Petr Kuznetsov and Maged M. 
Michael and Martin Vechev", title = "Laws of order: expensive synchronization in concurrent algorithms cannot be eliminated", journal = j-SIGPLAN, volume = "46", number = "1", pages = "487--498", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926442", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Esparza:2011:CPB, author = "Javier Esparza and Pierre Ganty", title = "Complexity of pattern-based verification for multithreaded programs", journal = j-SIGPLAN, volume = "46", number = "1", pages = "499--510", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926443", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Prabhu:2011:EAF, author = "Tarun Prabhu and Shreyas Ramalingam and Matthew Might and Mary Hall", title = "{EigenCFA}: accelerating flow analysis with {GPUs}", journal = j-SIGPLAN, volume = "46", number = "1", pages = "511--522", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926445", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Feng:2011:BQP, author = "Yuan Feng and Runyao Duan and Mingsheng Ying", title = "Bisimulation for quantum processes", journal = j-SIGPLAN, volume = "46", number = "1", pages = "523--534", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926446", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bocchino:2011:SND, author = "Robert L. {Bocchino, Jr.} and Stephen Heumann and Nima Honarmand and Sarita V. Adve and Vikram S. Adve and Adam Welc and Tatiana Shpeisman", title = "Safe nondeterminism in a deterministic-by-default parallel language", journal = j-SIGPLAN, volume = "46", number = "1", pages = "535--548", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926447", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pouchet:2011:LTC, author = "Louis-No{\"e}l Pouchet and Uday Bondhugula and C{\'e}dric Bastoul and Albert Cohen and J. Ramanujam and P. 
Sadayappan and Nicolas Vasilache", title = "Loop transformations: convexity, pruning and optimization", journal = j-SIGPLAN, volume = "46", number = "1", pages = "549--562", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926449", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Guo:2011:ECT, author = "Shu-yu Guo and Jens Palsberg", title = "The essence of compiling with traces", journal = j-SIGPLAN, volume = "46", number = "1", pages = "563--574", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926450", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ramsey:2011:RRM, author = "Norman Ramsey and Jo{\~a}o Dias", title = "Resourceable, retargetable, modular instruction selection using a machine-independent, type-based tiling of low-level intermediate code", journal = j-SIGPLAN, volume = "46", number = "1", pages = "575--586", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926451", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ong:2011:VHO, author = "C.-H. Luke Ong and Steven James Ramsay", title = "Verifying higher-order functional programs with pattern-matching algebraic data types", journal = j-SIGPLAN, volume = "46", number = "1", pages = "587--598", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926453", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Alur:2011:STA, author = "Rajeev Alur and Pavol Cern{\'y}", title = "Streaming transducers for algorithmic verification of single-pass list-processing programs", journal = j-SIGPLAN, volume = "46", number = "1", pages = "599--610", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926454", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Madhusudan:2011:DLC, author = "P. 
Madhusudan and Gennaro Parlato and Xiaokang Qiu", title = "Decidable logics combining heap structures and data", journal = j-SIGPLAN, volume = "46", number = "1", pages = "611--622", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926455", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Joisha:2011:TEA, author = "Pramod G. Joisha and Robert S. Schreiber and Prithviraj Banerjee and Hans J. Boehm and Dhruva R. Chakrabarti", title = "A technique for the effective and automatic reuse of classical compiler optimizations on multithreaded code", journal = j-SIGPLAN, volume = "46", number = "1", pages = "623--636", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926457", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lammel:2011:HGS, author = "Ralf L{\"a}mmel", title = "The hitchhiker's guide to software languages", journal = j-SIGPLAN, volume = "46", number = "2", pages = "1--2", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868295", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There is only that much space in the CS curriculum, and there are always new subjects that should be accommodated by the curriculum. For instance, in our community, we would want all graduates to leave university with a modest background in technical spaces, software languages, and meta-programming; also, with conceptually informed and reasonably timeless skills to efficiently master related programming techniques and technologies. In reality, the curricula of few CS departments meet this expectation. In this talk, I will discuss such curricula-related expectations of our community and the suboptimal situation at CS departments---as I perceive them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Erwig:2011:LSV, author = "Martin Erwig", title = "A language for software variation research", journal = j-SIGPLAN, volume = "46", number = "2", pages = "3--12", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868296", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Managing variation is an important problem in software engineering that takes different forms, ranging from version control and configuration management to software product lines.
In this paper, I present our recent work on the choice calculus, a fundamental representation for software variation that can serve as a common language of discourse for variation research, filling a role similar to lambda calculus in programming language research. After motivating the design of the choice calculus and sketching its semantics, I will discuss several potential application areas.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Clarke:2011:ADM, author = "Dave Clarke and Michiel Helvensteijn and Ina Schaefer", title = "Abstract delta modeling", journal = j-SIGPLAN, volume = "46", number = "2", pages = "13--22", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868298", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Delta modeling is an approach to facilitate automated product derivation for software product lines. It is based on a set of deltas specifying modifications that are incrementally applied to a core product. The applicability of deltas depends on feature-dependent conditions. This paper presents abstract delta modeling, which explores delta modeling from an abstract, algebraic perspective. Compared to previous work, we take a more flexible approach with respect to conflicts between modifications and introduce the notion of conflict-resolving deltas. We present conditions on the structure of deltas to ensure unambiguous product generation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ryssel:2011:AVP, author = "Uwe Ryssel and Joern Ploennigs and Klaus Kabitzsch", title = "Automatic variation-point identification in function-block-based models", journal = j-SIGPLAN, volume = "46", number = "2", pages = "23--32", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868299", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Function-block-based modeling is often used to develop embedded systems, particularly as system variants can be developed rapidly from existing modules. Generative approaches can simplify the handling and development of the resulting high variety of function-block-based models. But they often require the development of new generic models that do not utilize existing ones. Reusing existing models will significantly decrease the effort to apply generative programming. 
This work introduces an automatic approach to recognize variants in a set of models and identify the variation points and their dependencies within variants.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sincero:2011:EEA, author = "Julio Sincero and Reinhard Tartler and Daniel Lohmann and Wolfgang Schr{\"o}der-Preikschat", title = "Efficient extraction and analysis of preprocessor-based variability", journal = j-SIGPLAN, volume = "46", number = "2", pages = "33--42", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868300", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The C Preprocessor (CPP) is the tool of choice for the implementation of variability in many large-scale configurable software projects. Linux, probably the most-configurable piece of software ever, employs more than 10,000 preprocessor variables for this purpose. However, this de-facto variability tends to be ``hidden in the code''; which on the long term leads to variability defects, such as dead code or inconsistencies with respect to the intended (modeled) variability of the software. This calls for tool support for the efficient extraction of (and reasoning over) CPP-based variability. We suggest a novel approach to extract CPP-based variability. Our tool transforms CPP-based variability in O(n) complexity into a propositional formula that ``mimics'' all valid effects of conditional compilation and can be analyzed with standard SAT or BDD packages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Middelkoop:2011:ITI, author = "Arie Middelkoop and Atze Dijkstra and S. Doaitse Swierstra", title = "Iterative type inference with attribute grammars", journal = j-SIGPLAN, volume = "46", number = "2", pages = "43--52", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868302", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Type inference is the process of constructing a typing derivation while gradually discovering type information. During this process, inference algorithms typically make subtle decisions based on the derivation constructed so far. Because a typing derivation is a decorated tree we aim to use attribute grammars as the main implementation tool. Unfortunately, we can neither express iteration, nor express decisions based on intermediate derivations in such grammars. We present the language ruler-front, a conservative extension to ordered attribute grammars, that deals with the aforementioned problems. We show why this extension is suitable for the description of constraint-based inference algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Krieger:2011:AES, author = "Matthias P. 
Krieger and Alexander Knapp and Burkhart Wolff", title = "Automatic and efficient simulation of operation contracts", journal = j-SIGPLAN, volume = "46", number = "2", pages = "53--62", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868303", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Operation contracts consisting of pre- and postconditions are a well-known means of specifying operations. In this paper we deal with the problem of operation contract simulation, i.e., determining operation results satisfying the postconditions based on input data supplied by the user; simulating operation contracts is an important technique for requirements validation and prototyping. Current approaches to operation contract simulation exhibit poor performance for large sets of input data or require additional guidance from the user. We show how these problems can be alleviated and describe an efficient as well as fully automatic approach. It is implemented in our tool OCLexec that generates from UML/OCL operation contracts corresponding Java implementations which call a constraint solver at runtime.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Long:2011:IIM, author = "Yuheng Long and Sean L. Mooney and Tyler Sondag and Hridesh Rajan", title = "Implicit invocation meets safe, implicit concurrency", journal = j-SIGPLAN, volume = "46", number = "2", pages = "63--72", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868304", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Writing correct and efficient concurrent programs still remains a challenge. Explicit concurrency is difficult, error prone, and creates code which is hard to maintain and debug. This type of concurrency also treats modular program design and concurrency as separate goals, where modularity often suffers. To solve these problems, we are designing a new language that we call Panini. In this paper, we focus on Panini's asynchronous, typed events which reconcile the modularity goal promoted by the implicit invocation design style with the concurrency goal of exposing potential concurrency between the execution of subjects and observers. Since modularity is improved and concurrency is implicit in Panini, programs are easier to reason about and maintain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Navas:2011:CBR, author = "Juan F. 
Navas and Jean-Philippe Babau and Jacques Pulou", title = "A component-based run-time evolution infrastructure for resource-constrained embedded systems", journal = j-SIGPLAN, volume = "46", number = "2", pages = "73--82", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868306", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper deals with embedded systems software and the modification of its architecture and behavior at execution-time. Incautious implementation of these features demands both heavy memory and performance overrun. To accomplish such software evolution activities in resource-constrained embedded systems, we propose a component-based run-time evolution infrastructure that reconciles richness of evolution alternatives and performance requirements. Our proposal is based on off-site components reifications, which are representations of components that allow us to treat evolution concerns remotely. Hence, the workload to be processed by the embedded device is alleviated.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hofer:2011:MDS, author = "Christian Hofer and Klaus Ostermann", title = "Modular domain-specific language components in {Scala}", journal = j-SIGPLAN, volume = "46", number = "2", pages = "83--92", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868307", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programs in domain-specific embedded languages (DSELs) can be represented in the host language in different ways, for instance implicitly as libraries, or explicitly in the form of abstract syntax trees. Each of these representations has its own strengths and weaknesses. The implicit approach has good composability properties, whereas the explicit approach allows more freedom in making syntactic program transformations. Traditional designs for DSELs fix the form of representation, which means that it is not possible to choose the best representation for a particular interpretation or transformation. We propose a new design for implementing DSELs in Scala which makes it easy to use different program representations at the same time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wolfinger:2011:AGP, author = "Reinhard Wolfinger and Markus L{\"o}berbauer and Markus Jahn and Hanspeter M{\"o}ssenb{\"o}ck", title = "Adding genericity to a plug-in framework", journal = j-SIGPLAN, volume = "46", number = "2", pages = "93--102", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868308", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Plug-in components are a means for making feature-rich applications customizable. Combined with plug-and-play composition, end users can assemble customized applications without programming. 
If plug-and-play composition is also dynamic, applications can be reconfigured on the fly to load only components the user needs for his current work. We have created Plux.NET, a plug-in framework that supports dynamic plug-and-play composition. The basis for plug-and-play in Plux is the composer which replaces programmatic composition by automatic composition. Components just specify their requirements and provisions using metadata. The composer then assembles the components based on that metadata by matching requirements and provisions. When the composer needs to reuse general-purpose components in different parts of an application, the component model requires genericity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Schulze:2011:CCF, author = "Sandro Schulze and Sven Apel and Christian K{\"a}stner", title = "Code clones in feature-oriented software product lines", journal = j-SIGPLAN, volume = "46", number = "2", pages = "103--112", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868310", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Some limitations of object-oriented mechanisms are known to cause code clones (e.g., extension using inheritance). Novel programming paradigms such as feature-oriented programming (FOP) aim at alleviating these limitations. However, it is an open issue whether FOP is really able to avoid code clones or whether it even facilitates (FOP-related) clones. To address this issue, we conduct an empirical analysis on ten feature-oriented software product lines with respect to code cloning. We found that there is a considerable number of clones in feature-oriented software product lines and that a large fraction of these clones is FOP-related (i.e., caused by limitations of feature-oriented mechanisms).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tanter:2011:CDA, author = "{\'E}ric Tanter and Philippe Moret and Walter Binder and Danilo Ansaloni", title = "Composition of dynamic analysis aspects", journal = j-SIGPLAN, volume = "46", number = "2", pages = "113--122", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868311", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Aspect-oriented programming provides a convenient high-level model to define several kinds of dynamic analyses, in particular thanks to recent advances in exhaustive weaving in core libraries. Casting dynamic analyses as aspects allows the use of a single weaving infrastructure to apply different analyses to the same base program, simultaneously. However, even if dynamic analysis aspects are mutually independent, their mere presence perturbs the observations of others: this is due to the fact that aspectual computation is potentially visible to all aspects. 
Because current aspect composition approaches do not address this kind of computational interference, combining different analysis aspects yields at best unpredictable results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wurthinger:2011:AED, author = "Thomas W{\"u}rthinger and Walter Binder and Danilo Ansaloni and Philippe Moret and Hanspeter M{\"o}ssenb{\"o}ck", title = "Applications of enhanced dynamic code evolution for {Java} in {GUI} development and dynamic aspect-oriented programming", journal = j-SIGPLAN, volume = "46", number = "2", pages = "123--126", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868312", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "While dynamic code evolution in object-oriented systems is an important feature supported by dynamic languages, there is currently only limited support for dynamic code evolution in high-performance, state-of-the-art runtime systems for statically typed languages, such as the Java Virtual Machine. In this tool demonstration, we present the Dynamic Code Evolution VM, which is based on a recent version of Oracle's state-of-the-art Java HotSpot(TM) VM and allows unlimited changes to loaded classes at runtime. Based on the Dynamic Code Evolution VM, we developed an enhanced version of the Mantisse GUI builder (which is part of the NetBeans IDE) that allows adding GUI components without restarting the application under development.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Rompf:2011:LMS, author = "Tiark Rompf and Martin Odersky", title = "Lightweight modular staging: a pragmatic approach to runtime code generation and compiled {DSLs}", journal = j-SIGPLAN, volume = "46", number = "2", pages = "127--136", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868314", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software engineering demands generality and abstraction, performance demands specialization and concretization. Generative programming can provide both, but the effort required to develop high-quality program generators likely offsets their benefits, even if a multi-stage programming language is used. We present lightweight modular staging, a library-based multi-stage programming approach that breaks with the tradition of syntactic quasi-quotation and instead uses only types to distinguish between binding times. Through extensive use of component technology, lightweight modular staging makes an optimizing compiler framework available at the library level, allowing programmers to tightly integrate domain-specific abstractions and optimizations into the generation process. 
We argue that lightweight modular staging enables a form of language virtualization, i.e.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Porkolab:2011:DSL, author = "Zolt{\'a}n Porkolab and {\'A}bel Sinkovics", title = "Domain-specific language integration with compile-time parser generator library", journal = j-SIGPLAN, volume = "46", number = "2", pages = "137--146", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868315", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Smooth integration of domain-specific languages into a general purpose host language requires absorbing of domain code written in arbitrary syntax. The integration should cause minimal syntactical and semantic overhead and introduce minimal dependency on external tools. In this paper we discuss a DSL integration technique for the C++ programming language. The solution is based on compile-time parsing of the DSL code. The parser generator is a C++ template metaprogram reimplementation of a runtime Haskell parser generator library. The full parsing phase is executed when the host program is compiled. The library uses only standard C++ language features, thus our solution is highly portable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Atkinson:2011:ACT, author = "Kevin Atkinson and Matthew Flatt and Gary Lindstrom", title = "{ABI} compatibility through a customizable language", journal = j-SIGPLAN, volume = "46", number = "2", pages = "147--156", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868316", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "ZL is a C++-compatible language in which high-level constructs, such as classes, are defined using macros over a C-like core language. This approach makes many parts of the language easily customizable. For example, since the class construct can be defined using macros, a programmer can have complete control over the memory layout of objects. 
Using this capability, a programmer can mitigate certain problems in software evolution such as fragile ABIs (Application Binary Interfaces) due to software changes and incompatible ABIs due to compiler changes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bordignon:2011:MBK, author = "Mirko Bordignon and Ulrik Pagh Schultz and Kasper Stoy", title = "Model-based kinematics generation for modular mechatronic toolkits", journal = j-SIGPLAN, volume = "46", number = "2", pages = "157--166", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868318", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modular robots are mechatronic devices that enable the construction of highly versatile and flexible robotic systems whose mechanical structure can be dynamically modified. The key feature that enables this dynamic modification is the capability of the individual modules to connect to each other in multiple ways and thus generate a number of different mechanical systems, in contrast with the monolithic fixed structure of conventional robots. The mechatronic flexibility, however, complicates the development of models and programming abstractions for modular robots, since manually describing and enumerating the full set of possible interconnections is tedious and error-prone for real-world robots. In order to allow for a general formulation of spatial abstractions for modular robots and to ensure correct and streamlined generation of code dependent on mechanical properties, we have developed the Modular Mechatronics Modelling Language (M3L).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Miao:2011:ITC, author = "Weiyu Miao and Jeremy G. Siek", title = "Incremental type-checking for type-reflective metaprograms", journal = j-SIGPLAN, volume = "46", number = "2", pages = "167--176", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868319", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Garcia introduces a calculus for type-reflective metaprogramming that provides much of the power and flexibility of C++ templates and solves many of its problems. However, one of the problems that remains is that the residual program is not type checked until after meta computation is complete. Ideally, one would like the type system of the metaprogram to also guarantee that the residual program will type check, as is the case in MetaML.
However, in a language with type-reflective metaprogramming, type expressions in the residual program may be the result of meta computation, making the MetaML guarantee next to impossible to achieve.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Grech:2011:JGE, author = "Neville Grech and Julian Rathke and Bernd Fischer", title = "{JEqualityGen}: generating equality and hashing methods", journal = j-SIGPLAN, volume = "46", number = "2", pages = "177--186", month = feb, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1942788.1868320", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Feb 14 16:37:34 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Manually implementing equals (for object comparisons) and hashCode (for object hashing) methods in large software projects is tedious and error-prone. This is due to many special cases, such as field shadowing, comparison between different types, or cyclic object graphs. Here, we present JEqualityGen, a source code generator that automatically derives implementations of these methods. JEqualityGen proceeds in two stages: it first uses source code reflection in MetaAspectJ to generate aspects that contain the method implementations, before it uses weaving on the bytecode level to insert these into the target application. JEqualityGen generates not only correct, but efficient source code that on a typical large-scale Java application exhibits a performance improvement of more than two orders of magnitude in the equality operations generated, compared to an existing system based on runtime reflection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Larus:2011:CWC, author = "James R. Larus", title = "The cloud will change everything", journal = j-SIGPLAN, volume = "46", number = "3", pages = "1--2", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950367", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Yuan:2011:ISD, author = "Ding Yuan and Jing Zheng and Soyeon Park and Yuanyuan Zhou and Stefan Savage", title = "Improving software diagnosability via log enhancement", journal = j-SIGPLAN, volume = "46", number = "3", pages = "3--14", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950369", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Veeraraghavan:2011:DPS, author = "Kaushik Veeraraghavan and Dongyoon Lee and Benjamin Wester and Jessica Ouyang and Peter M.
Chen and Jason Flinn and Satish Narayanasamy", title = "{DoublePlay}: parallelizing sequential logging and replay", journal = j-SIGPLAN, volume = "46", number = "3", pages = "15--26", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950370", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Casper:2011:HAT, author = "Jared Casper and Tayo Oguntebi and Sungpack Hong and Nathan G. Bronson and Christos Kozyrakis and Kunle Olukotun", title = "Hardware acceleration of transactional memory on commodity systems", journal = j-SIGPLAN, volume = "46", number = "3", pages = "27--38", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950372", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Dalessandro:2011:HNC, author = "Luke Dalessandro and Fran{\c{c}}ois Carouge and Sean White and Yossi Lev and Mark Moir and Michael L. Scott and Michael F. Spear", title = "{Hybrid NOrec}: a case study in the effectiveness of best effort hardware transactional memory", journal = j-SIGPLAN, volume = "46", number = "3", pages = "39--52", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950373", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Singh:2011:EPS, author = "Abhayendra Singh and Daniel Marino and Satish Narayanasamy and Todd Millstein and Madan Musuvathi", title = "Efficient processor support for {DRFx}, a memory model with exceptions", journal = j-SIGPLAN, volume = "46", number = "3", pages = "53--66", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950375", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Devietti:2011:RRC, author = "Joseph Devietti and Jacob Nelson and Tom Bergan and Luis Ceze and Dan Grossman", title = "{RCDC}: a relaxed consistency deterministic computer", journal = j-SIGPLAN, volume = "46", number = "3", pages = "67--78", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950376", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; 
https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Burnim:2011:SCS, author = "Jacob Burnim and George Necula and Koushik Sen", title = "Specifying and checking semantic atomicity for multithreaded programs", journal = j-SIGPLAN, volume = "46", number = "3", pages = "79--90", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Volos:2011:MLP, author = "Haris Volos and Andres Jaan Tack and Michael M. Swift", title = "{Mnemosyne}: lightweight persistent memory", journal = j-SIGPLAN, volume = "46", number = "3", pages = "91--104", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950379", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Coburn:2011:NHM, author = "Joel Coburn and Adrian M. Caulfield and Ameen Akel and Laura M. Grupp and Rajesh K. Gupta and Ranjit Jhala and Steven Swanson", title = "{NV-Heaps}: making persistent objects fast and safe with next-generation, non-volatile memories", journal = j-SIGPLAN, volume = "46", number = "3", pages = "105--118", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950380", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Schupbach:2011:DLA, author = "Adrian Sch{\"u}pbach and Andrew Baumann and Timothy Roscoe and Simon Peter", title = "A declarative language approach to device configuration", journal = j-SIGPLAN, volume = "46", number = "3", pages = "119--132", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950382", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Ryzhyk:2011:IDD, author = "Leonid Ryzhyk and John Keys and Balachandra Mirla and Arun Raghunath and Mona Vij and Gernot Heiser", title = "Improved device driver reliability through hardware verification reuse", journal = j-SIGPLAN, volume = "46", number = "3", pages = "133--144", month = mar, year = "2011", CODEN = "SINODQ", DOI = 
"https://doi.org/10.1145/1961296.1950383", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Hashmi:2011:CNI, author = "Atif Hashmi and Andrew Nere and James Jamal Thomas and Mikko Lipasti", title = "A case for neuromorphic {ISAs}", journal = j-SIGPLAN, volume = "46", number = "3", pages = "145--158", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950385", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Ransford:2011:MSS, author = "Benjamin Ransford and Jacob Sorber and Kevin Fu", title = "{Mementos}: system support for long-running computation on {RFID}-scale devices", journal = j-SIGPLAN, volume = "46", number = "3", pages = "159--170", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950386", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Koukoumidis:2011:PC, author = "Emmanouil Koukoumidis and Dimitrios Lymberopoulos and Karin Strauss and Jie Liu and Doug Burger", title = "Pocket cloudlets", journal = j-SIGPLAN, volume = "46", number = "3", pages = "171--184", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950387", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Sharma:2011:BMS, author = "Navin Sharma and Sean Barker and David Irwin and Prashant Shenoy", title = "{Blink}: managing server clusters on intermittent power", journal = j-SIGPLAN, volume = "46", number = "3", pages = "185--198", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950389", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Hoffmann:2011:DKR, author = "Henry Hoffmann and Stelios Sidiroglou and Michael Carbin and Sasa Misailovic and Anant Agarwal and Martin Rinard", title = "Dynamic knobs for responsive power-aware computing", journal = j-SIGPLAN, volume = "46", number 
= "3", pages = "199--212", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950390", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Liu:2011:FSD, author = "Song Liu and Karthik Pattabiraman and Thomas Moscibroda and Benjamin G. Zorn", title = "{Flikker}: saving {DRAM} refresh-power through critical data partitioning", journal = j-SIGPLAN, volume = "46", number = "3", pages = "213--224", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950391", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Deng:2011:MAL, author = "Qingyuan Deng and David Meisner and Luiz Ramos and Thomas F. Wenisch and Ricardo Bianchini", title = "{MemScale}: active low-power modes for main memory", journal = j-SIGPLAN, volume = "46", number = "3", pages = "225--238", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950392", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Gao:2011:TMH, author = "Qi Gao and Wenbin Zhang and Zhezhe Chen and Mai Zheng and Feng Qin", title = "{2ndStrike}: toward manifesting hidden concurrency typestate bugs", journal = j-SIGPLAN, volume = "46", number = "3", pages = "239--250", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950394", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Zhang:2011:CDC, author = "Wei Zhang and Junghee Lim and Ramya Olichandran and Joel Scherpelz and Guoliang Jin and Shan Lu and Thomas Reps", title = "{ConSeq}: detecting concurrency bugs through sequential errors", journal = j-SIGPLAN, volume = "46", number = "3", pages = "251--264", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950395", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Chipounov:2011:SPV, author = "Vitaly 
Chipounov and Volodymyr Kuznetsov and George Candea", title = "{S2E}: a platform for in-vivo multi-path analysis of software systems", journal = j-SIGPLAN, volume = "46", number = "3", pages = "265--278", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950396", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Hofmann:2011:EOS, author = "Owen S. Hofmann and Alan M. Dunn and Sangman Kim and Indrajit Roy and Emmett Witchel", title = "Ensuring operating system kernel integrity with {OSck}", journal = j-SIGPLAN, volume = "46", number = "3", pages = "279--290", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950398", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Porter:2011:RLT, author = "Donald E. Porter and Silas Boyd-Wickizer and Jon Howell and Reuben Olinsky and Galen C. Hunt", title = "Rethinking the library {OS} from the top down", journal = j-SIGPLAN, volume = "46", number = "3", pages = "291--304", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950399", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Palix:2011:FLT, author = "Nicolas Palix and Ga{\"e}l Thomas and Suman Saha and Christophe Calv{\`e}s and Julia Lawall and Gilles Muller", title = "Faults in {Linux}: ten years later", journal = j-SIGPLAN, volume = "46", number = "3", pages = "305--318", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950401", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Esmaeilzadeh:2011:LBL, author = "Hadi Esmaeilzadeh and Ting Cao and Yang Xi and Stephen M. Blackburn and Kathryn S. 
McKinley", title = "Looking back on the language and hardware revolutions: measured power, performance, and scaling", journal = j-SIGPLAN, volume = "46", number = "3", pages = "319--332", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950402", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Nguyen:2011:SCS, author = "Donald Nguyen and Keshav Pingali", title = "Synthesizing concurrent schedulers for irregular algorithms", journal = j-SIGPLAN, volume = "46", number = "3", pages = "333--344", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950404", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Hoang:2011:ECT, author = "Giang Hoang and Robby Bruce Findler and Russ Joseph", title = "Exploring circuit timing-aware language and compilation", journal = j-SIGPLAN, volume = "46", number = "3", pages = "345--356", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950405", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Farhad:2011:OAM, author = "Sardar M. Farhad and Yousun Ko and Bernd Burgstaller and Bernhard Scholz", title = "Orchestration by approximation: mapping stream programs onto multicore architectures", journal = j-SIGPLAN, volume = "46", number = "3", pages = "357--368", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950406", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Zhang:2011:FED, author = "Eddy Z. 
Zhang and Yunlian Jiang and Ziyu Guo and Kai Tian and Xipeng Shen", title = "On-the-fly elimination of dynamic irregularities for {GPU} computing", journal = j-SIGPLAN, volume = "46", number = "3", pages = "369--380", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950408", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Hormati:2011:SPS, author = "Amir H. Hormati and Mehrzad Samadi and Mark Woh and Trevor Mudge and Scott Mahlke", title = "{Sponge}: portable stream programming on graphics engines", journal = j-SIGPLAN, volume = "46", number = "3", pages = "381--392", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950409", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Kamruzzaman:2011:ICP, author = "Md Kamruzzaman and Steven Swanson and Dean M. Tullsen", title = "Inter-core prefetching for multicore processors using migrating helper threads", journal = j-SIGPLAN, volume = "46", number = "3", pages = "393--404", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950411", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Hayashizaki:2011:IPT, author = "Hiroshige Hayashizaki and Peng Wu and Hiroshi Inoue and Mauricio J. 
Serrano and Toshio Nakatani", title = "Improving the performance of trace-based systems by false loop filtering", journal = j-SIGPLAN, volume = "46", number = "3", pages = "405--418", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950412", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '11 conference proceedings", } @Article{Bala:2011:DTD, author = "Vasanth Bala and Evelyn Duesterwald and Sanjeev Banerjia", title = "{Dynamo}: a transparent dynamic optimization system", journal = j-SIGPLAN, volume = "46", number = "4", pages = "41--52", month = apr, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1988042.1988044", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe the design and implementation of Dynamo, a software dynamic optimization system that is capable of transparently improving the performance of a native instruction stream as it executes on the processor. The input native instruction stream to Dynamo can be dynamically generated (by a JIT for example), or it can come from the execution of a statically compiled native binary. This paper evaluates the Dynamo system in the latter, more challenging situation, in order to emphasize the limits, rather than the potential, of the system. Our experiments demonstrate that even statically optimized native binaries can be accelerated by Dynamo, and often by a significant degree.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Claessen:2011:QLT, author = "Koen Claessen and John Hughes", title = "{QuickCheck}: a lightweight tool for random testing of {Haskell} programs", journal = j-SIGPLAN, volume = "46", number = "4", pages = "53--64", month = apr, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1988042.1988046", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "QuickCheck is a tool which aids the Haskell programmer in formulating and testing properties of programs. Properties are described as Haskell functions, and can be automatically tested on random input, but it is also possible to define custom test data generators. We present a number of case studies, in which the tool was successfully used, and also point out some pitfalls to avoid. Random testing is especially suitable for functional programs because properties can be stated at a fine grain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Arnold:2011:AOJ, author = "Matthew Arnold and Stephen Fink and David Grove and Michael Hind and Peter F.
Sweeney", title = "Adaptive optimization in the {Jalapeno JVM}", journal = j-SIGPLAN, volume = "46", number = "4", pages = "65--83", month = apr, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1988042.1988048", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Future high-performance virtual machines will improve performance through sophisticated online feedback-directed optimizations. This paper presents the architecture of the Jalapeno Adaptive Optimization System, a system to support leading-edge virtual machine technology and enable ongoing research on online feedback-directed optimizations. We describe the extensible system architecture, based on a federation of threads with asynchronous communication. We present an implementation of the general architecture that supports adaptive multi-level optimization based purely on statistical sampling. We empirically demonstrate that this profiling technique has low overhead and can improve startup and steady-state performance, even without the presence of online feedback-directed optimizations. The paper also describes and evaluates an online feedback-directed inlining optimization based on statistical edge sampling.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ishtiaq:2011:BAL, author = "Samin Ishtiaq and Peter W. O'Hearn", title = "{BI} as an assertion language for mutable data structures", journal = j-SIGPLAN, volume = "46", number = "4", pages = "84--96", month = apr, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1988042.1988050", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reynolds has developed a logic for reasoning about mutable data structures in which the pre- and postconditions are written in an intuitionistic logic enriched with a spatial form of conjunction. We investigate the approach from the point of view of the logic BI of bunched implications of O'Hearn and Pym. We begin by giving a model in which the law of the excluded middle holds, thus showing that the approach is compatible with classical logic. The relationship between the intuitionistic and classical versions of the system is established by a translation, analogous to a translation from intuitionistic logic into the modal logic S4.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Virlet:2011:SSB, author = "Bruno Virlet and Xing Zhou and Jean Pierre Giacalone and Bob Kuhn and Maria J. 
Garzaran and David Padua", title = "Scheduling of stream-based real-time applications for heterogeneous systems", journal = j-SIGPLAN, volume = "46", number = "5", pages = "1--10", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967679", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Chattopadhyay:2011:SBS, author = "Sudipta Chattopadhyay and Abhik Roychoudhury", title = "Static bus schedule aware scratchpad allocation in multiprocessors", journal = j-SIGPLAN, volume = "46", number = "5", pages = "11--20", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967680", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Albert:2011:TLA, author = "Elvira Albert and Puri Arenas and Samir Genaim and Damiano Zanardini", title = "Task-level analysis for a language with async\slash finish parallelism", journal = j-SIGPLAN, volume = "46", number = "5", pages = "21--30", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967681", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Chang:2011:LCW, author = "Li-Pin Chang and Li-Chun Huang", title = "A low-cost wear-leveling algorithm for block-mapping solid-state disks", journal = j-SIGPLAN, volume = "46", number = "5", pages = "31--40", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967683", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multilevel flash memory cells double or even triple storage density, producing affordable solid-state disks for end users. However, flash lifetime is becoming a critical issue in the popularity of solid-state disks. Wear-leveling methods can prevent flash-storage devices from prematurely retiring any portions of flash memory. The two practical challenges of wear-leveling design are implementation cost and tuning complexity. This study proposes a new wear-leveling design that features both simplicity and adaptiveness. This design requires no new data structures, but utilizes the intelligence available in sector-translating algorithms. Using an on-line tuning method, this design adaptively tunes itself to reach good balance between wear evenness and overhead. 
A series of trace-driven simulations show that the proposed design outperforms a competitive existing design in terms of wear evenness and overhead reduction. This study also presents a prototype that proves the feasibility of this wear-leveling design in real solid-state disks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Saha:2011:AIS, author = "Suman Saha and Julia Lawall and Gilles Muller", title = "An approach to improving the structure of error-handling code in the {Linux} kernel", journal = j-SIGPLAN, volume = "46", number = "5", pages = "41--50", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967684", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Gray:2011:TCE, author = "Ian Gray and Neil C. Audsley", title = "Targeting complex embedded architectures by combining the multicore communications {API} ({{\tt mcapi}}) with compile-time virtualisation", journal = j-SIGPLAN, volume = "46", number = "5", pages = "51--60", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967685", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Benveniste:2011:DRT, author = "Albert Benveniste and Timothy Bourke and Beno{\^\i}t Caillaud and Marc Pouzet", title = "Divide and recycle: types and compilation for a hybrid synchronous language", journal = j-SIGPLAN, volume = "46", number = "5", pages = "61--70", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967687", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Gamatie:2011:SAS, author = "Abdoulaye Gamatie and Laure Gonnord", title = "Static analysis of synchronous programs in signal for efficient design of multi-clocked embedded systems", journal = j-SIGPLAN, volume = "46", number = "5", pages = "71--80", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967688", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Berthier:2011:SPD, author = "Nicolas Berthier and Florence Maraninchi and Laurent Mounier", title = "Synchronous programming 
of device drivers for global resource control in embedded operating systems", journal = j-SIGPLAN, volume = "46", number = "5", pages = "81--90", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967689", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Wang:2011:DBM, author = "Man Wang and Zhiyuan Li and Feng Li and Xiaobing Feng and Saurabh Bagchi and Yung-Hsiang Lu", title = "Dependence-based multi-level tracing and replay for wireless sensor networks debugging", journal = j-SIGPLAN, volume = "46", number = "5", pages = "91--100", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967691", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Thomas:2011:LOS, author = "Johnson J. Thomas and Sebastian Fischmeister and Deepak Kumar", title = "Lowering overhead in sampling-based execution monitoring and tracing", journal = j-SIGPLAN, volume = "46", number = "5", pages = "101--110", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967692", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Navabpour:2011:SDT, author = "Samaneh Navabpour and Borzoo Bonakdarpour and Sebastian Fischmeister", title = "Software debugging and testing using the abstract diagnosis theory", journal = j-SIGPLAN, volume = "46", number = "5", pages = "111--120", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967693", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Cullmann:2011:CPA, author = "Christoph Cullmann", title = "Cache persistence analysis: a novel approach---theory and practice", journal = j-SIGPLAN, volume = "46", number = "5", pages = "121--130", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967695", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", }
@Article{Sarkar:2011:PTM, author = "Abhik Sarkar and Frank Mueller and Harini Ramaprasad", title = "Predictable task migration for locked caches in multi-core systems", journal = j-SIGPLAN, volume = "46", number = "5", pages = "131--140", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967696", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Althaus:2011:PEP, author = "Ernst Althaus and Sebastian Altmeyer and Rouven Naujoks", title = "Precise and efficient parametric path analysis", journal = j-SIGPLAN, volume = "46", number = "5", pages = "141--150", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967697", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Jang:2011:ISA, author = "Choonki Jang and Jungwon Kim and Jaejin Lee and Hee-Seok Kim and Dong-Hoon Yoo and Sukjin Kim and Hong-Seok Kim and Soojung Ryu", title = "An instruction-scheduling-aware data partitioning technique for coarse-grained reconfigurable architectures", journal = j-SIGPLAN, volume = "46", number = "5", pages = "151--160", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967699", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Bhagat:2011:GPP, author = "Indu Bhagat and Enric Gibert and Jes{\'u}s S{\'a}nchez and Antonio Gonz{\'a}lez", title = "Global productiveness propagation: a code optimization technique to speculatively prune useless narrow computations", journal = j-SIGPLAN, volume = "46", number = "5", pages = "161--170", month = may, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2016603.1967700", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Aug 18 13:30:54 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '10 conference proceedings", } @Article{Prabhu:2011:CSL, author = "Prakash Prabhu and Soumyadeep Ghosh and Yun Zhang and Nick P. Johnson and David I. 
August", title = "Commutative set: a language extension for implicit parallel programming", journal = j-SIGPLAN, volume = "46", number = "6", pages = "1--11", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993500", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pingali:2011:TPA, author = "Keshav Pingali and Donald Nguyen and Milind Kulkarni and Martin Burtscher and M. Amber Hassaan and Rashid Kaleem and Tsung-Hsien Lee and Andrew Lenharth and Roman Manevich and Mario M{\'e}ndez-Lojo and Dimitrios Prountzos and Xin Sui", title = "The tao of parallelism in algorithms", journal = j-SIGPLAN, volume = "46", number = "6", pages = "12--25", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993501", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Raman:2011:POU, author = "Arun Raman and Hanjun Kim and Taewook Oh and Jae W. Lee and David I. August", title = "Parallelism orchestration using {DoPE}: the degree of parallelism executive", journal = j-SIGPLAN, volume = "46", number = "6", pages = "26--37", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993502", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hawkins:2011:DRS, author = "Peter Hawkins and Alex Aiken and Kathleen Fisher and Martin Rinard and Mooly Sagiv", title = "Data representation synthesis", journal = j-SIGPLAN, volume = "46", number = "6", pages = "38--49", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993504", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gulwani:2011:SGC, author = "Sumit Gulwani and Vijay Anand Korthikanti and Ashish Tiwari", title = "Synthesizing geometry constructions", journal = j-SIGPLAN, volume = "46", number = "6", pages = "50--61", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993505", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gulwani:2011:SLF, author = "Sumit Gulwani and Susmit Jha and Ashish Tiwari and Ramarathnam Venkatesan", title = 
"Synthesis of loop-free programs", journal = j-SIGPLAN, volume = "46", number = "6", pages = "62--73", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993506", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bohm:2011:GJT, author = "Igor B{\"o}hm and Tobias J. K. Edler von Koch and Stephen C. Kyle and Bj{\"o}rn Franke and Nigel Topham", title = "Generalized just-in-time trace compilation using a parallel task farm in a dynamic binary translator", journal = j-SIGPLAN, volume = "46", number = "6", pages = "74--85", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993508", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jung:2011:BES, author = "Changhee Jung and Silvius Rus and Brian P. Railing and Nathan Clark and Santosh Pande", title = "{Brainy}: effective selection of data structures", journal = j-SIGPLAN, volume = "46", number = "6", pages = "86--97", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993509", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhou:2011:SBA, author = "Hucheng Zhou and Wenguang Chen and Fred Chow", title = "An {SSA}-based algorithm for optimal speculative code motion under an execution profile", journal = j-SIGPLAN, volume = "46", number = "6", pages = "98--108", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2011:CHD, author = "Xun Li and Mohit Tiwari and Jason K. Oberg and Vineeth Kashyap and Frederic T. 
Chong and Timothy Sherwood and Ben Hardekopf", title = "{Caisson}: a hardware description language for secure information flow", journal = j-SIGPLAN, volume = "46", number = "6", pages = "109--120", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993512", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Murray:2011:SAO, author = "Derek Gordon Murray and Michael Isard and Yuan Yu", title = "{Steno}: automatic optimization of declarative queries", journal = j-SIGPLAN, volume = "46", number = "6", pages = "121--131", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993513", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tobin-Hochstadt:2011:LL, author = "Sam Tobin-Hochstadt and Vincent St-Amour and Ryan Culpepper and Matthew Flatt and Matthias Felleisen", title = "Languages as libraries", journal = j-SIGPLAN, volume = "46", number = "6", pages = "132--141", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993514", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jablin:2011:ACG, author = "Thomas B. Jablin and Prakash Prabhu and James A. Jablin and Nick P. Johnson and Stephen R. Beard and David I. August", title = "Automatic {CPU--GPU} communication management and optimization", journal = j-SIGPLAN, volume = "46", number = "6", pages = "142--151", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993516", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Prasad:2011:ACM, author = "Ashwin Prasad and Jayvant Anantpur and R. 
Govindarajan", title = "Automatic compilation of {MATLAB} programs for synergistic execution on heterogeneous processors", journal = j-SIGPLAN, volume = "46", number = "6", pages = "152--163", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993517", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sampson:2011:EAD, author = "Adrian Sampson and Werner Dietl and Emily Fortuna and Danushen Gnanapragasam and Luis Ceze and Dan Grossman", title = "{EnerJ}: approximate data types for safe and general low-power computation", journal = j-SIGPLAN, volume = "46", number = "6", pages = "164--174", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sarkar:2011:UPM, author = "Susmit Sarkar and Peter Sewell and Jade Alglave and Luc Maranget and Derek Williams", title = "Understanding {POWER} multiprocessors", journal = j-SIGPLAN, volume = "46", number = "6", pages = "175--186", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993520", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kuperstein:2011:PCA, author = "Michael Kuperstein and Martin Vechev and Eran Yahav", title = "Partial-coherence abstractions for relaxed memory models", journal = j-SIGPLAN, volume = "46", number = "6", pages = "187--198", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993521", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Marino:2011:CSP, author = "Daniel Marino and Abhayendra Singh and Todd Millstein and Madanlal Musuvathi and Satish Narayanasamy", title = "A case for an {SC}-preserving compiler", journal = j-SIGPLAN, volume = "46", number = "6", pages = "199--210", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The most intuitive memory consistency model for shared-memory multi-threaded programming is sequential consistency (SC). 
However, current concurrent programming languages support a relaxed model, as such relaxations are deemed necessary for enabling important optimizations. This paper demonstrates that an SC-preserving compiler, one that ensures that every SC behavior of a compiler-generated binary is an SC behavior of the source program, retains most of the performance benefits of an optimizing compiler. The key observation is that a large class of optimizations crucial for performance are either already SC-preserving or can be modified to preserve SC while retaining much of their effectiveness. An SC-preserving compiler, obtained by restricting the optimization phases in LLVM, a state-of-the-art C/C++ compiler, incurs an average slowdown of 3.8\% and a maximum slowdown of 34\% on a set of 30 programs from the SPLASH-2, PARSEC, and SPEC CINT2006 benchmark suites.\par While the performance overhead of preserving SC in the compiler is much less than previously assumed, it might still be unacceptable for certain applications. We believe there are several avenues for improving performance without giving up SC-preservation. In this vein, we observe that the overhead of our SC-preserving compiler arises mainly from its inability to aggressively perform a class of optimizations we identify as eager-load optimizations. This class includes common-subexpression elimination, constant propagation, global value numbering, and common cases of loop-invariant code motion. We propose a notion of interference checks in order to enable eager-load optimizations while preserving SC. Interference checks expose to the compiler a commonly used hardware speculation mechanism that can efficiently detect whether a particular variable has changed its value since last read.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "LLVM compiler suite; sequential consistency (SC)", } @Article{Beckman:2011:PMS, author = "Nels E. Beckman and Aditya V. 
Nori", title = "Probabilistic, modular and scalable inference of typestate specifications", journal = j-SIGPLAN, volume = "46", number = "6", pages = "211--221", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993524", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kobayashi:2011:PAC, author = "Naoki Kobayashi and Ryosuke Sato and Hiroshi Unno", title = "Predicate abstraction and {CEGAR} for higher-order model checking", journal = j-SIGPLAN, volume = "46", number = "6", pages = "222--233", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993525", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chlipala:2011:MAV, author = "Adam Chlipala", title = "Mostly-automated verification of low-level programs in computational separation logic", journal = j-SIGPLAN, volume = "46", number = "6", pages = "234--245", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993526", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lee:2011:TGR, author = "Kyu Hyung Lee and Yunhui Zheng and Nick Sumner and Xiangyu Zhang", title = "Toward generating reducible replay logs", journal = j-SIGPLAN, volume = "46", number = "6", pages = "246--257", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993528", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Godefroid:2011:HOT, author = "Patrice Godefroid", title = "Higher-order test generation", journal = j-SIGPLAN, volume = "46", number = "6", pages = "258--269", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Xu:2011:LHP, author = "Guoqing Xu and Michael D. 
Bond and Feng Qin and Atanas Rountev", title = "{LeakChaser}: helping programmers narrow down causes of memory leaks", journal = j-SIGPLAN, volume = "46", number = "6", pages = "270--282", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993530", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yang:2011:FUB, author = "Xuejun Yang and Yang Chen and Eric Eide and John Regehr", title = "Finding and understanding bugs in {C} compilers", journal = j-SIGPLAN, volume = "46", number = "6", pages = "283--294", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993532", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compilers should be correct. To improve the quality of C compilers, we created Csmith, a randomized test-case generation tool, and spent three years using it to find compiler bugs. During this period we reported more than 325 previously unknown bugs to compiler developers. Every compiler we tested was found to crash and also to silently generate wrong code when presented with valid input. In this paper we present our compiler-testing tool and the results of our bug-hunting study. Our first contribution is to advance the state of the art in compiler testing. Unlike previous tools, Csmith generates programs that cover a large subset of C while avoiding the undefined and unspecified behaviors that would destroy its ability to automatically find wrong-code bugs. Our second contribution is a collection of qualitative and quantitative results about the bugs we have found in open-source C compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tristan:2011:EVG, author = "Jean-Baptiste Tristan and Paul Govereau and Greg Morrisett", title = "Evaluating value-graph translation validation for {LLVM}", journal = j-SIGPLAN, volume = "46", number = "6", pages = "295--305", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993533", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sevcik:2011:SOS, author = "Jaroslav {\v{S}}ev{\v{c}}{\'\i}k", title = "Safe optimisations for shared-memory concurrent programs", journal = j-SIGPLAN, volume = "46", number = "6", pages = "306--316", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Harris:2011:STT, author = "William R.
Harris and Sumit Gulwani", title = "Spreadsheet table transformations from examples", journal = j-SIGPLAN, volume = "46", number = "6", pages = "317--328", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993536", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Meng:2011:SEG, author = "Na Meng and Miryung Kim and Kathryn S. McKinley", title = "Systematic editing: generating program transformations from an example", journal = j-SIGPLAN, volume = "46", number = "6", pages = "329--342", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993537", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Srivastava:2011:SPO, author = "Varun Srivastava and Michael D. Bond and Kathryn S. McKinley and Vitaly Shmatikov", title = "A security policy oracle: detecting security holes using multiple {API} implementations", journal = j-SIGPLAN, volume = "46", number = "6", pages = "343--354", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993539", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ansel:2011:LIS, author = "Jason Ansel and Petr Marchenko and Ulfar Erlingsson and Elijah Taylor and Brad Chen and Derek L. Schuff and David Sehr and Cliff L. Biffle and Bennet Yee", title = "Language-independent sandboxing of just-in-time compilation and self-modifying code", journal = j-SIGPLAN, volume = "46", number = "6", pages = "355--366", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993540", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zeng:2011:CCH, author = "Qiang Zeng and Dinghao Wu and Peng Liu", title = "{Cruiser}: concurrent heap buffer overflow monitoring using lock-free data structures", journal = j-SIGPLAN, volume = "46", number = "6", pages = "367--377", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993541", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lucia:2011:IUC, author = "Brandon Lucia and Benjamin P. 
Wood and Luis Ceze", title = "Isolating and understanding concurrency errors using reconstructed execution fragments", journal = j-SIGPLAN, volume = "46", number = "6", pages = "378--388", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993543", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jin:2011:AAV, author = "Guoliang Jin and Linhai Song and Wei Zhang and Shan Lu and Ben Liblit", title = "Automated atomicity-violation fixing", journal = j-SIGPLAN, volume = "46", number = "6", pages = "389--400", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993544", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Burnim:2011:NRC, author = "Jacob Burnim and Tayfun Elmas and George Necula and Koushik Sen", title = "{NDSeq}: runtime checking for nondeterministic sequential specifications of parallel correctness", journal = j-SIGPLAN, volume = "46", number = "6", pages = "401--414", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993545", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jin:2011:GCM, author = "Dongyun Jin and Patrick O'Neil Meredith and Dennis Griffith and Grigore Rosu", title = "Garbage collection for monitoring parametric properties", journal = j-SIGPLAN, volume = "46", number = "6", pages = "415--424", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993547", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Parr:2011:LFA, author = "Terence Parr and Kathleen Fisher", title = "{LL(*)}: the foundation of the {ANTLR} parser generator", journal = j-SIGPLAN, volume = "46", number = "6", pages = "425--436", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993548", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite the power of Parser Expression Grammars (PEGs) and GLR, parsing is not a solved problem. Adding nondeterminism (parser speculation) to traditional LL and LR parsers can lead to unexpected parse-time behavior and introduces practical issues with error handling, single-step debugging, and side-effecting embedded grammar actions. 
This paper introduces the LL(*) parsing strategy and an associated grammar analysis algorithm that constructs LL(*) parsing decisions from ANTLR grammars. At parse-time, decisions gracefully throttle up from conventional fixed $ k \geq 1 $ lookahead to arbitrary lookahead and, finally, fail over to backtracking depending on the complexity of the parsing decision and the input symbols. LL(*) parsing strength reaches into the context-sensitive languages, in some cases beyond what GLR and PEGs can express. By statically removing as much speculation as possible, LL(*) provides the expressivity of PEGs while retaining LL's good error handling and unrestricted grammar actions. Widespread use of ANTLR (over 70,000 downloads/year) shows that it is effective for a wide variety of applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jose:2011:CCC, author = "Manu Jose and Rupak Majumdar", title = "Cause clue clauses: error localization using maximum satisfiability", journal = j-SIGPLAN, volume = "46", number = "6", pages = "437--446", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993550", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Budi:2011:AMA, author = "Aditya Budi and David Lo and Lingxiao Jiang and Lucia", title = "$ k b $-anonymity: a model for anonymized behaviour-preserving test and debugging data", journal = j-SIGPLAN, volume = "46", number = "6", pages = "447--457", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993551", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Garcia:2011:KRR, author = "Saturnino Garcia and Donghwan Jeon and Christopher M. Louie and Michael Bedford Taylor", title = "{Kremlin}: rethinking and rebooting {{\tt gprof}} for the multicore age", journal = j-SIGPLAN, volume = "46", number = "6", pages = "458--469", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993553", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many recent parallelization tools lower the barrier for parallelizing a program, but overlook one of the first questions that a programmer needs to answer: which parts of the program should I spend time parallelizing?\par This paper examines Kremlin, an automatic tool that, given a serial version of a program, will make recommendations to the user as to what regions (e.g. loops or functions) of the program to attack first. Kremlin introduces a novel hierarchical critical path analysis and develops a new metric for estimating the potential of parallelizing a region: self-parallelism.
We further introduce the concept of a parallelism planner, which provides a ranked order of specific regions to the programmer that are likely to have the largest performance impact when parallelized. Kremlin supports multiple planner personalities, which allow the planner to more effectively target a particular programming environment or class of machine.\par We demonstrate the effectiveness of one such personality, an OpenMP planner, by comparing versions of programs that are parallelized according to Kremlin's plan against third-party manually parallelized versions. The results show that Kremlin's OpenMP planner is highly effective, producing plans whose performance is typically comparable to, and sometimes much better than, manual parallelization. At the same time, these plans would require that the user parallelize significantly fewer regions of the program.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sato:2011:APM, author = "Shigeyuki Sato and Hideya Iwasaki", title = "Automatic parallelization via matrix multiplication", journal = j-SIGPLAN, volume = "46", number = "6", pages = "470--479", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993554", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Udupa:2011:AEB, author = "Abhishek Udupa and Kaushik Rajan and William Thies", title = "{ALTER}: exploiting breakable dependences for parallelization", journal = j-SIGPLAN, volume = "46", number = "6", pages = "480--491", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993555", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Srivastava:2011:PBI, author = "Saurabh Srivastava and Sumit Gulwani and Swarat Chaudhuri and Jeffrey S. 
Foster", title = "Path-based inductive synthesis for program inversion", journal = j-SIGPLAN, volume = "46", number = "6", pages = "492--503", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993557", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Person:2011:DIS, author = "Suzette Person and Guowei Yang and Neha Rungta and Sarfraz Khurshid", title = "Directed incremental symbolic execution", journal = j-SIGPLAN, volume = "46", number = "6", pages = "504--515", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993558", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{DElia:2011:MHC, author = "Daniele Cono D'Elia and Camil Demetrescu and Irene Finocchi", title = "Mining hot calling contexts in small space", journal = j-SIGPLAN, volume = "46", number = "6", pages = "516--527", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993559", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kim:2011:VSC, author = "Deokhwan Kim and Martin C. 
Rinard", title = "Verification of semantic commutativity conditions and inverse operations on linked data structures", journal = j-SIGPLAN, volume = "46", number = "6", pages = "528--541", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993561", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kulkarni:2011:ECL, author = "Milind Kulkarni and Donald Nguyen and Dimitrios Prountzos and Xin Sui and Keshav Pingali", title = "Exploiting the commutativity lattice", journal = j-SIGPLAN, volume = "46", number = "6", pages = "542--555", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993562", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Perez:2011:SLS, author = "Juan Antonio Navarro P{\'e}rez and Andrey Rybalchenko", title = "Separation logic $+$ superposition calculus $=$ heap theorem prover", journal = j-SIGPLAN, volume = "46", number = "6", pages = "556--566", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993563", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dillig:2011:PCM, author = "Isil Dillig and Thomas Dillig and Alex Aiken and Mooly Sagiv", title = "Precise and compact modular procedure summaries for heap manipulating programs", journal = j-SIGPLAN, volume = "46", number = "6", pages = "567--577", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993565", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bouajjani:2011:IPA, author = "Ahmed Bouajjani and Cezara Dragoi and Constantin Enea and Mihaela Sighireanu", title = "On inter-procedural analysis of programs with lists and data", journal = j-SIGPLAN, volume = "46", number = "6", pages = "578--589", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993566", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liang:2011:SAR, author = "Percy Liang and Mayur Naik", title = "Scaling abstraction refinement via pruning", journal = j-SIGPLAN, volume = "46", number = "6", pages = "590--601", 
month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993567", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Altidor:2011:TWC, author = "John Altidor and Shan Shan Huang and Yannis Smaragdakis", title = "Taming the wildcards: combining definition- and use-site variance", journal = j-SIGPLAN, volume = "46", number = "6", pages = "602--613", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993569", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tate:2011:TWJ, author = "Ross Tate and Alan Leung and Sorin Lerner", title = "Taming wildcards in {Java}'s type system", journal = j-SIGPLAN, volume = "46", number = "6", pages = "614--627", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993570", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ziarek:2011:CAE, author = "Lukasz Ziarek and KC Sivaramakrishnan and Suresh Jagannathan", title = "Composable asynchronous events", journal = j-SIGPLAN, volume = "46", number = "6", pages = "628--639", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993572", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Best:2011:SST, author = "Micah J. Best and Shane Mottishaw and Craig Mustard and Mark Roth and Alexandra Fedorova and Andrew Brownsword", title = "Synchronization via scheduling: techniques for efficiently managing shared state", journal = j-SIGPLAN, volume = "46", number = "6", pages = "640--652", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993573", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bacon:2011:VAH, author = "David F. 
Bacon", title = "Virtualization in the age of heterogeneous machines", journal = j-SIGPLAN, volume = "46", number = "7", pages = "1--2", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952684", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Du:2011:PPV, author = "Jiaqing Du and Nipun Sehrawat and Willy Zwaenepoel", title = "Performance profiling of virtual machines", journal = j-SIGPLAN, volume = "46", number = "7", pages = "3--14", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952686", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nikolaev:2011:PXF, author = "Ruslan Nikolaev and Godmar Back", title = "{Perfctr-Xen}: a framework for performance counter virtualization", journal = j-SIGPLAN, volume = "46", number = "7", pages = "15--26", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952687", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhao:2011:DCC, author = "Qin Zhao and David Koh and Syed Raza and Derek Bruening and Weng-Fai Wong and Saman Amarasinghe", title = "Dynamic cache contention detection in multi-threaded applications", journal = j-SIGPLAN, volume = "46", number = "7", pages = "27--38", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952688", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wang:2011:RVM, author = "Kun Wang and Jia Rao and Cheng-Zhong Xu", title = "Rethink the virtual machine template", journal = j-SIGPLAN, volume = "46", number = "7", pages = "39--50", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952690", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cecchet:2011:DVD, author = "Emmanuel Cecchet and Rahul Singh and Upendra Sharma and Prashant Shenoy", title = "{Dolly}: virtualization-driven database provisioning for the cloud", journal = j-SIGPLAN, volume = "46", number = "7", pages = "51--62", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952691", 
ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Le:2011:REV, author = "Michael Le and Yuval Tamir", title = "{ReHype}: enabling {VM} survival across hypervisor failures", journal = j-SIGPLAN, volume = "46", number = "7", pages = "63--74", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952692", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Park:2011:FSE, author = "Eunbyung Park and Bernhard Egger and Jaejin Lee", title = "Fast and space-efficient virtual machine checkpointing", journal = j-SIGPLAN, volume = "46", number = "7", pages = "75--86", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952694", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhang:2011:FRC, author = "Irene Zhang and Alex Garthwaite and Yury Baskakov and Kenneth C. Barr", title = "Fast restore of checkpointed memory using working set estimation", journal = j-SIGPLAN, volume = "46", number = "7", pages = "87--98", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952695", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kourai:2011:FCP, author = "Kenichi Kourai", title = "Fast and correct performance recovery of operating systems using a virtual machine monitor", journal = j-SIGPLAN, volume = "46", number = "7", pages = "99--110", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952696", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Svard:2011:EDC, author = "Petter Sv{\"a}rd and Benoit Hudzia and Johan Tordsson and Erik Elmroth", title = "Evaluation of delta compression techniques for efficient live migration of large virtual machines", journal = j-SIGPLAN, volume = "46", number = "7", pages = "111--120", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952698", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; 
https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wood:2011:CDP, author = "Timothy Wood and K. K. Ramakrishnan and Prashant Shenoy and Jacobus van der Merwe", title = "{CloudNet}: dynamic pooling of cloud resources by live {WAN} migration of virtual machines", journal = j-SIGPLAN, volume = "46", number = "7", pages = "121--132", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952699", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zheng:2011:WAL, author = "Jie Zheng and Tze Sing Eugene Ng and Kunwadee Sripanidkulchai", title = "Workload-aware live storage migration for clouds", journal = j-SIGPLAN, volume = "46", number = "7", pages = "133--144", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952700", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Litty:2011:PAI, author = "Lionel Litty and David Lie", title = "Patch auditing in infrastructure as a service clouds", journal = j-SIGPLAN, volume = "46", number = "7", pages = "145--156", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952702", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Payer:2011:FGU, author = "Mathias Payer and Thomas R. Gross", title = "Fine-grained user-space security through virtualization", journal = j-SIGPLAN, volume = "46", number = "7", pages = "157--168", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952703", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lange:2011:MOV, author = "John R. Lange and Kevin Pedretti and Peter Dinda and Patrick G. 
Bridges and Chang Bae and Philip Soltero and Alexander Merritt", title = "Minimal-overhead virtualization of a large scale supercomputer", journal = j-SIGPLAN, volume = "46", number = "7", pages = "169--180", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952705", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Xia:2011:VWB, author = "Lei Xia and Sanjay Kumar and Xue Yang and Praveen Gopalakrishnan and York Liu and Sebastian Schoenberg and Xingang Guo", title = "Virtual {WiFi}: bring virtualization from wired to wireless", journal = j-SIGPLAN, volume = "46", number = "7", pages = "181--192", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952706", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lange:2011:SSV, author = "John R. Lange and Peter Dinda", title = "{SymCall}: symbiotic virtualization through {VMM}-to-guest upcalls", journal = j-SIGPLAN, volume = "46", number = "7", pages = "193--204", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952707", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Williams:2011:OHM, author = "Dan Williams and Hani Jamjoom and Yew-Huey Liu and Hakim Weatherspoon", title = "{Overdriver}: handling memory overload in an oversubscribed cloud", journal = j-SIGPLAN, volume = "46", number = "7", pages = "205--216", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952709", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wang:2011:SHS, author = "Xiaolin Wang and Jiarui Zang and Zhenlin Wang and Yingwei Luo and Xiaoming Li", title = "Selective hardware\slash software memory virtualization", journal = j-SIGPLAN, volume = "46", number = "7", pages = "217--226", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952710", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Roy:2011:HBR, author = "Amitabha Roy and Steven Hand and Tim Harris", title = "Hybrid binary rewriting for memory access instrumentation", 
journal = j-SIGPLAN, volume = "46", number = "7", pages = "227--238", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952711", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Larus:2011:PC, author = "James R. Larus", title = "Programming the cloud", journal = j-SIGPLAN, volume = "46", number = "8", pages = "1--2", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941555", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Client + cloud computing is a disruptive, new computing platform, combining diverse client devices --- PCs, smartphones, sensors, and single-function and embedded devices --- with the unlimited, on-demand computation and data storage offered by cloud computing services such as Amazon's AWS or Microsoft's Windows Azure. As with every advance in computing, programming is a fundamental challenge as client + cloud computing combines many difficult aspects of software development. Systems built for this world are inherently parallel and distributed, run on unreliable hardware, and must be continually available --- a challenging programming model for even the most skilled programmers. How then do ordinary programmers develop software for the Cloud? This talk presents one answer, Orleans, a software framework for building client + cloud applications. Orleans encourages use of simple concurrency patterns that are easy to understand and implement correctly, building on an actor-like model with declarative specification of persistence, replication, and consistency and using lightweight transactions to support the development of reliable and scalable client + cloud software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hassaan:2011:OVU, author = "Muhammad Amber Hassaan and Martin Burtscher and Keshav Pingali", title = "Ordered vs. unordered: a comparison of parallelism and work-efficiency in irregular algorithms", journal = j-SIGPLAN, volume = "46", number = "8", pages = "3--12", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941557", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Outside of computational science, most problems are formulated in terms of irregular data structures such as graphs, trees and sets. Unfortunately, we understand relatively little about the structure of parallelism and locality in irregular algorithms. In this paper, we study multiple algorithms for four such problems: discrete-event simulation, single-source shortest path, breadth-first search, and minimal spanning trees. 
We show that the algorithms can be classified into two categories that we call unordered and ordered, and demonstrate experimentally that there is a trade-off between parallelism and work efficiency: unordered algorithms usually have more parallelism than their ordered counterparts for the same problem, but they may also perform more work. Nevertheless, our experimental results show that unordered algorithms typically lead to more scalable implementations, demonstrating that less work-efficient irregular algorithms may be better for parallel execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bauer:2011:PMH, author = "Michael Bauer and John Clark and Eric Schkufza and Alex Aiken", title = "Programming the memory hierarchy revisited: supporting irregular parallelism in {Sequoia}", journal = j-SIGPLAN, volume = "46", number = "8", pages = "13--24", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941558", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "We describe two novel constructs for programming parallel machines with multi-level memory hierarchies: call-up, which allows a child task to invoke computation on its parent, and spawn, which spawns a dynamically determined number of parallel children until some termination condition in the parent is met. Together we show that these constructs allow applications with irregular parallelism to be programmed in a straightforward manner, and furthermore these constructs complement and can be combined with constructs for expressing regular parallelism. We have implemented spawn and call-up in Sequoia and we present an experimental evaluation on a number of irregular applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Murarasu:2011:CDS, author = "Alin Murarasu and Josef Weidendorfer and Gerrit Buse and Daniel Butnaru and Dirk Pfl{\"u}ger", title = "Compact data structure and scalable algorithms for the sparse grid technique", journal = j-SIGPLAN, volume = "46", number = "8", pages = "25--34", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941559", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "The sparse grid discretization technique enables a compressed representation of higher-dimensional functions. In its original form, it relies heavily on recursion and complex data structures, thus being far from well-suited for GPUs. In this paper, we describe optimizations that enable us to implement compression and decompression, the crucial sparse grid algorithms for our application, on Nvidia GPUs. The main idea consists of a bijective mapping between the set of points in a multi-dimensional sparse grid and a set of consecutive natural numbers. The resulting data structure consumes a minimum amount of memory. 
For a 10-dimensional sparse grid with approximately 127 million points, it consumes up to 30 times less memory than trees or hash tables which are typically used. Compared to a sequential CPU implementation, the speedups achieved on GPU are up to 17 for compression and up to 70 for decompression, respectively. We show that the optimizations are also applicable to multicore CPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chafi:2011:DSA, author = "Hassan Chafi and Arvind K. Sujeeth and Kevin J. Brown and HyoukJoong Lee and Anand R. Atreya and Kunle Olukotun", title = "A domain-specific approach to heterogeneous parallelism", journal = j-SIGPLAN, volume = "46", number = "8", pages = "35--46", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941561", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Exploiting heterogeneous parallel hardware currently requires mapping application code to multiple disparate programming models. Unfortunately, general-purpose programming models available today can yield high performance but are too low-level to be accessible to the average programmer. We propose leveraging domain-specific languages (DSLs) to map high-level application code to heterogeneous devices. To demonstrate the potential of this approach we present OptiML, a DSL for machine learning. OptiML programs are implicitly parallel and can achieve high performance on heterogeneous hardware with no modification required to the source code. For such a DSL-based approach to be tractable at large scales, better tools are required for DSL authors to simplify language creation and parallelization. To address this concern, we introduce Delite, a system designed specifically for DSLs that is both a framework for creating an implicitly parallel DSL as well as a dynamic runtime providing automated targeting to heterogeneous parallel hardware. We show that OptiML running on Delite achieves single-threaded, parallel, and GPU performance superior to explicitly parallelized MATLAB code in nearly all cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Catanzaro:2011:CCE, author = "Bryan Catanzaro and Michael Garland and Kurt Keutzer", title = "{Copperhead}: compiling an embedded data parallel language", journal = j-SIGPLAN, volume = "46", number = "8", pages = "47--56", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941562", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Modern parallel microprocessors deliver high performance on applications that expose substantial fine-grained data parallelism. Although data parallelism is widely available in many computations, implementing data parallel algorithms in low-level languages is often an unnecessarily difficult task. 
The characteristics of parallel microprocessors and the limitations of current programming methodologies motivate our design of Copperhead, a high-level data parallel language embedded in Python. The Copperhead programmer describes parallel computations via composition of familiar data parallel primitives supporting both flat and nested data parallel computation on arrays of data. Copperhead programs are expressed in a subset of the widely used Python programming language and interoperate with standard Python modules, including libraries for numeric computation, data visualization, and analysis. In this paper, we discuss the language, compiler, and runtime features that enable Copperhead to efficiently execute data parallel code. We define the restricted subset of Python which Copperhead supports and introduce the program analysis techniques necessary for compiling Copperhead code into efficient low-level implementations. We also outline the runtime support by which Copperhead programs interoperate with standard Python modules. We demonstrate the effectiveness of our techniques with several examples targeting the CUDA platform for parallel programming on GPUs. Copperhead code is concise, on average requiring 3.6 times fewer lines of code than CUDA, and the compiler generates efficient code, yielding 45-100\% of the performance of hand-crafted, well optimized CUDA code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jenista:2011:OSO, author = "James Christopher Jenista and Yong hun Eom and Brian Charles Demsky", title = "{OoOJava}: software out-of-order execution", journal = j-SIGPLAN, volume = "46", number = "8", pages = "57--68", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941563", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Developing parallel software using current tools can be challenging. Even experts find it difficult to reason about the use of locks and often accidentally introduce race conditions and deadlocks into parallel software. OoOJava is a compiler-assisted approach that leverages developer annotations along with static analysis to provide an easy-to-use deterministic parallel programming model. OoOJava extends Java with a task annotation that instructs the compiler to consider a code block for out-of-order execution. OoOJava executes tasks as soon as their data dependences are resolved and guarantees that the execution of an annotated program preserves the exact semantics of the original sequential program. 
We have implemented OoOJava and achieved an average speedup of 16.6x on our ten benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Feng:2011:SSP, author = "Min Feng and Rajiv Gupta and Yi Hu", title = "{SpiceC}: scalable parallelism via implicit copying and explicit commit", journal = j-SIGPLAN, volume = "46", number = "8", pages = "69--80", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941564", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "In this paper we present an approach to parallel programming called SpiceC. SpiceC simplifies the task of parallel programming through a combination of an intuitive computation model and SpiceC directives. The SpiceC parallel computation model consists of multiple threads where every thread has a private space for data and all threads share data via a shared space. Each thread performs computations using its private space thus offering isolation which allows for speculative computations. SpiceC provides easy to use SpiceC compiler directives using which the programmers can express different forms of parallelism. It allows developers to express high level constraints on data transfers between spaces while the tedious task of generating the code for the data transfers is performed by the compiler. SpiceC also supports data transfers involving dynamic data structures without help from developers. SpiceC allows developers to create clusters of data to enable parallel data transfers. SpiceC programs are portable across modern chip multiprocessor based machines that may or may not support cache coherence. We have developed implementations of SpiceC for shared memory systems with and without cache coherence. We evaluate our implementation using seven benchmarks of which four are parallelized speculatively. Our compiler generated implementations achieve speedups ranging from 2x to 18x on a 24 core system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Negara:2011:IOT, author = "Stas Negara and Rajesh K. Karmani and Gul Agha", title = "Inferring ownership transfer for efficient message passing", journal = j-SIGPLAN, volume = "46", number = "8", pages = "81--90", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941566", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "One of the more popular paradigms for concurrent programming is the Actor model of message passing; it has been adopted in one form or another by a number of languages and frameworks. By avoiding a shared local state and instead relying on message passing, the Actor model facilitates modular programming. An important challenge for message passing languages is to transmit messages efficiently. This requires retaining the pass-by-value semantics of messages while avoiding making a deep copy on sequential or shared memory multicore processors. 
A key observation is that many messages have an ownership transfer semantics; such messages can be sent efficiently using pointers without introducing shared state between concurrent objects. We propose a conservative static analysis algorithm which infers if the content of a message is compatible with an ownership transfer semantics. Our tool, called SOTER (for Safe Ownership Transfer enablER) transforms the program to avoid the cost of copying the contents of a message whenever it can infer the content obeys the ownership transfer semantics. Experiments using a range of programs suggest that our conservative static analysis method is usually able to infer ownership transfer. Performance results demonstrate that the transformed programs execute up to an order of magnitude faster than the original programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Xiang:2011:AWP, author = "Xiaoya Xiang and Bin Bao and Tongxin Bai and Chen Ding and Trishul Chilimbi", title = "All-window profiling and composable models of cache sharing", journal = j-SIGPLAN, volume = "46", number = "8", pages = "91--102", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941567", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "As multi-core processors become commonplace and cloud computing is gaining acceptance, more applications are run in a shared cache environment. Cache sharing depends on a concept called footprint, which depends on all cache accesses not just cache misses. Previous work has recognized the importance of footprint but has not provided a method for accurate measurement, mainly because the complete measurement requires counting data access in all execution windows, which takes time quadratic in the length of a trace. The paper first presents an algorithm efficient enough for off-line use to approximately measure the footprint with a guaranteed precision. The cost of the analysis can be adjusted by changing the precision. Then the paper presents a composable model. For a set of programs, the model uses the all-window footprint of each program to predict its cache interference with other programs without running these programs together. 
The paper evaluates the efficiency of all-window profiling using the SPEC 2000 benchmarks and compares the footprint interference model with a miss-rate based model and with exhaustive testing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ding:2011:UUL, author = "Xiaoning Ding and Kaibo Wang and Xiaodong Zhang", title = "{ULCC}: a user-level facility for optimizing shared cache performance on multicores", journal = j-SIGPLAN, volume = "46", number = "8", pages = "103--112", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941568", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Scientific applications face serious performance challenges on multicore processors, one of which is caused by access contention in last level shared caches from multiple running threads. The contention increases the number of long latency memory accesses, and consequently increases application execution times. Optimizing shared cache performance is critical to reduce significantly execution times of multi-threaded programs on multicores. However, there are two unique problems to be solved before implementing cache optimization techniques on multicores at the user level. First, available cache space for each running thread in a last level cache is difficult to predict due to access contention in the shared space, which makes cache conscious algorithms for single cores ineffective on multicores. Second, at the user level, programmers are not able to allocate cache space at will to running threads in the shared cache, thus data sets with strong locality may not be allocated with sufficient cache space, and cache pollution can easily happen. To address these two critical issues, we have designed ULCC (User Level Cache Control), a software runtime library that enables programmers to explicitly manage and optimize last level cache usage by allocating proper cache space for different data sets of different threads. We have implemented ULCC at the user level based on a page-coloring technique for last level cache usage management. By means of multiple case studies on an Intel multicore processor, we show that with ULCC, scientific applications can achieve significant performance improvements by fully exploiting the benefit of cache optimization algorithms and by partitioning the cache space accordingly to protect frequently reused data sets and to avoid cache pollution. 
Our experiments with various applications show that ULCC can significantly improve application performance by nearly 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wu:2011:STB, author = "Xing Wu and Frank Mueller", title = "{ScalaExtrap}: trace-based communication extrapolation for {SPMD} programs", journal = j-SIGPLAN, volume = "46", number = "8", pages = "113--122", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941569", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Performance modeling for scientific applications is important for assessing potential application performance and systems procurement in high-performance computing (HPC). Recent progress on communication tracing opens up novel opportunities for communication modeling due to its lossless yet scalable trace collection. Estimating the impact of scaling on communication efficiency still remains non-trivial due to execution-time variations and exposure to hardware and software artifacts. This work contributes a fundamentally novel modeling scheme. We synthetically generate the application trace for large numbers of nodes by extrapolation from a set of smaller traces. We devise an innovative approach for topology extrapolation of single program, multiple data (SPMD) codes with stencil or mesh communication. The extrapolated trace can subsequently be (a) replayed to assess communication requirements before porting an application, (b) transformed to auto-generate communication benchmarks for various target platforms, and (c) analyzed to detect communication inefficiencies and scalability limitations. To the best of our knowledge, rapidly obtaining the communication behavior of parallel applications at arbitrary scale with the availability of timed replay, yet without actual execution of the application at this scale is without precedence and has the potential to enable otherwise infeasible system simulation at the exascale level.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{McKinley:2011:HPC, author = "Kathryn S. McKinley", title = "How's the parallel computing revolution going?", journal = j-SIGPLAN, volume = "46", number = "8", pages = "123--124", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941571", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Two trends changed the computing landscape over the past decade: (1) hardware vendors started delivering chip multiprocessors (CMPs) instead of uniprocessors, and (2) software developers increasingly chose managed languages instead of native languages. Unfortunately, the former change is disrupting the virtuous-cycle between performance improvements and software innovation. 
Establishing a new parallel performance virtuous cycle for managed languages will require scalable applications executing on scalable Virtual Machine (VM) services, since the VM schedules, monitors, compiles, optimizes, garbage collects, and executes together with the application. This talk describes current progress, opportunities, and challenges for scalable VM services. The parallel computing revolution urgently needs more innovations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Karmani:2011:TCS, author = "Rajesh K. Karmani and P. Madhusudan and Brandon M. Moore", title = "Thread contracts for safe parallelism", journal = j-SIGPLAN, volume = "46", number = "8", pages = "125--134", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941573", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "We build a framework of thread contracts, called Accord, that allows programmers to annotate their concurrency co-ordination strategies. Accord annotations allow programmers to declaratively specify the parts of memory that a thread may read or write into, and the locks that protect them, reflecting the concurrency co-ordination among threads and the reason why the program is free of data-races. We provide automatic tools to check if the concurrency co-ordination strategy ensures race-freedom, using constraint-solvers (SMT solvers). Hence programmers using Accord can both formally state and prove their co-ordination strategies ensure race freedom. The programmer's implementation of the co-ordination strategy may however be correct or incorrect. We show how the formal Accord contracts allow us to automatically insert runtime assertions that serve to check, during testing, whether the implementation conforms to the contract. Using a large class of data-parallel programs that share memory in intricate ways, we show that natural and simple contracts suffice to document the co-ordination strategy amongst threads, and that the task of showing that the strategy ensures race-freedom can be handled efficiently and automatically by an existing SMT solver (Z3). While co-ordination strategies can be proved race-free in our framework, failure to prove the co-ordination strategy race-free, accompanied by counter-examples produced by the solver, indicates the presence of races. Using such counterexamples, we report hitherto undiscovered data-races that we found in the long-tested {\tt applu\_l} benchmark in the Spec OMP2001 suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zheng:2011:GLO, author = "Mai Zheng and Vignesh T. 
Ravi and Feng Qin and Gagan Agrawal", title = "{GRace}: a low-overhead mechanism for detecting data races in {GPU} programs", journal = j-SIGPLAN, volume = "46", number = "8", pages = "135--146", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941574", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "In recent years, GPUs have emerged as an extremely cost-effective means for achieving high performance. Many application developers, including those with no prior parallel programming experience, are now trying to scale their applications using GPUs. While languages like CUDA and OpenCL have eased GPU programming for non-graphical applications, they are still explicitly parallel languages. All parallel programmers, particularly the novices, need tools that can help ensuring the correctness of their programs. Like any multithreaded environment, data races on GPUs can severely affect the program reliability. Thus, tool support for detecting race conditions can significantly benefit GPU application developers. Existing approaches for detecting data races on CPUs or GPUs have one or more of the following limitations: (1) being ill-suited for handling non-lock synchronization primitives on GPUs; (2) lacking of scalability due to the state explosion problem; (3) reporting many false positives because of simplified modeling; and/or (4) incurring prohibitive runtime and space overhead. In this paper, we propose GRace, a new mechanism for detecting races in GPU programs that combines static analysis with a carefully designed dynamic checker for logging and analyzing information at runtime. Our design utilizes GPUs memory hierarchy to log runtime data accesses efficiently. To improve the performance, GRace leverages static analysis to reduce the number of statements that need to be instrumented. Additionally, by exploiting the knowledge of thread scheduling and the execution model in the underlying GPUs, GRace can accurately detect data races with no false positives reported. Based on the above idea, we have built a prototype of GRace with two schemes, i.e., GRace-stmt and GRace-addr, for NVIDIA GPUs. Both schemes are integrated with the same static analysis. We have evaluated GRace-stmt and GRace-addr with three data race bugs in three GPU kernel functions and also have compared them with the existing approach, referred to as B-tool. Our experimental results show that both schemes of GRace are effective in detecting all evaluated cases with no false positives, whereas Btool reports many false positives for one evaluated case. On the one hand, GRace-addr incurs low runtime overhead, i.e., 22-116\%, and low space overhead, i.e., 9-18MB, for the evaluated kernels. 
On the other hand, GRace-stmt offers more help in diagnosing data races with larger overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yi:2011:CRP, author = "Jaeheon Yi and Caitlin Sadowski and Cormac Flanagan", title = "Cooperative reasoning for preemptive execution", journal = j-SIGPLAN, volume = "46", number = "8", pages = "147--156", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941575", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "We propose a cooperative methodology for multithreaded software, where threads use traditional synchronization idioms such as locks, but additionally document each point of potential thread interference with a ``yield'' annotation. Under this methodology, code between two successive yield annotations forms a serializable transaction that is amenable to sequential reasoning. This methodology reduces the burden of reasoning about thread interleavings by indicating only those interference points that matter. We present experimental results showing that very few yield annotations are required, typically one or two per thousand lines of code. We also present dynamic analysis algorithms for detecting cooperability violations, where thread interference is not documented by a yield, and for yield annotation inference for legacy software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lesani:2011:CMT, author = "Mohsen Lesani and Jens Palsberg", title = "Communicating memory transactions", journal = j-SIGPLAN, volume = "46", number = "8", pages = "157--168", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941577", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Many concurrent programming models enable both transactional memory and message passing. For such models, researchers have built increasingly efficient implementations and defined reasonable correctness criteria, while it remains an open problem to obtain the best of both worlds. We present a programming model that is the first to have opaque transactions, safe asynchronous message passing, and an efficient implementation. Our semantics uses tentative message passing and keeps track of dependencies to enable undo of message passing in case a transaction aborts. We can program communication idioms such as barrier and rendezvous that do not deadlock when used in an atomic block. Our experiments show that our model adds little overhead to pure transactions, and that it is significantly more efficient than Transactional Events. We use a novel definition of safe message passing that may be of independent interest.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Luchangco:2011:TCE, author = "Victor Luchangco and Virendra J. 
Marathe", title = "Transaction communicators: enabling cooperation among concurrent transactions", journal = j-SIGPLAN, volume = "46", number = "8", pages = "169--178", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941578", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "In this paper, we propose to extend transactional memory with transaction communicators, special objects through which concurrent transactions can communicate: changes by one transaction to a communicator can be seen by concurrent transactions before the first transaction commits. Although isolation of transactions is compromised by such communication, we constrain the effects of this compromise by tracking dependencies among transactions, and preventing any transaction from committing unless every transaction whose changes it saw also commits. In particular, mutually dependent transactions must commit or abort together, and transactions that do not communicate remain isolated. To help programmers synchronize accesses to communicators, we also provide special communicator-isolating transactions, which ensure isolation even for accesses to communicators. We propose language features to help programmers express the communicator constructs. We implemented a novel communicators-enabled STM runtime in the Maxine VM. Our preliminary evaluation demonstrates that communicators can be used in diverse settings to improve the performance of transactional programs, and to empower programmers with the ability to safely express within transactions important programming idioms that fundamentally require compromise of transaction isolation (e.g., CSP-style synchronous communication).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fernandes:2011:LFS, author = "S{\'e}rgio Miguel Fernandes and Jo{\~a}o Cachopo", title = "Lock-free and scalable multi-version software transactional memory", journal = j-SIGPLAN, volume = "46", number = "8", pages = "179--188", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941579", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Software Transactional Memory (STM) was initially proposed as a lock-free mechanism for concurrency control. Early implementations had efficiency limitations, and soon obstruction-free proposals appeared, to tackle this problem, often simplifying STM implementation. Today, most of the modern and top-performing STMs use blocking designs, relying on locks to ensure an atomic commit operation. This approach has revealed better in practice, in part due to its simplicity. Yet, it may have scalability problems when we move into many-core computers, requiring fine-tuning and careful programming to avoid contention. 
In this paper we present and discuss the modifications we made to a lock-based multi-version STM in Java, to turn it into a lock-free implementation that we have tested to scale at least up to 192 cores, and which provides results that compete with, and sometimes exceed, some of today's top-performing lock-based implementations. The new lock-free commit algorithm allows write transactions to proceed in parallel, by allowing them to run their validation phase independently of each other, and by resorting to helping from threads that would otherwise be waiting to commit, during the write-back phase. We also present a new garbage collection algorithm to dispose of old unused object versions that allows for asynchronous identification of unnecessary versions, which minimizes its interference with the rest of the transactional system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tian:2011:ESP, author = "Chen Tian and Changhui Lin and Min Feng and Rajiv Gupta", title = "Enhanced speculative parallelization via incremental recovery", journal = j-SIGPLAN, volume = "46", number = "8", pages = "189--200", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941580", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "The widespread availability of multicore systems has led to an increased interest in speculative parallelization of sequential programs using software-based thread level speculation. Many of the proposed techniques are implemented via state separation where non-speculative computation state is maintained separately from the speculative state of threads performing speculative computations. If speculation is successful, the results from speculative state are committed to non-speculative state. However, upon misspeculation, discard-all scheme is employed in which speculatively computed results of a thread are discarded and the computation is performed again. While this scheme is simple to implement, one disadvantage of discard-all is its inability to tolerate high misspeculation rates due to its high runtime overhead. Thus, it is not suitable for use in applications where misspeculation rates are input dependent and therefore may reach high levels. In this paper we develop an approach for incremental recovery in which, instead of discarding all of the results and reexecuting the speculative computation in its entirety, the computation is restarted from the earliest point at which a misspeculation causing value is read. This approach has two advantages. First, the cost of recovery is reduced as only part of the computation is reexecuted. Second, since recovery takes less time, the likelihood of future misspeculations is reduced. We design and implement a strategy for implementing incremental recovery that allows results of partial computations to be efficiently saved and reused. For a set of programs where misspeculation rate is input dependent, our experiments show that with inputs that result in misspeculation rates of around 40\% and 80\%, applying incremental recovery technique results in 1.2x-3.3x and 2.0x-6.6x speedups respectively over the discard-all recovery scheme. 
Furthermore, misspeculations observed during the discard-all scheme are reduced when incremental recovery is employed --- reductions range from 10\% to 85\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Saraswat:2011:LBG, author = "Vijay A. Saraswat and Prabhanjan Kambadur and Sreedhar Kodali and David Grove and Sriram Krishnamoorthy", title = "Lifeline-based global load balancing", journal = j-SIGPLAN, volume = "46", number = "8", pages = "201--212", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941582", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "On shared-memory systems, Cilk-style work-stealing has been used to effectively parallelize irregular task-graph based applications such as Unbalanced Tree Search (UTS). There are two main difficulties in extending this approach to distributed memory. In the shared memory approach, thieves (nodes without work) constantly attempt to asynchronously steal work from randomly chosen victims until they find work. In distributed memory, thieves cannot autonomously steal work from a victim without disrupting its execution. When work is sparse, this results in performance degradation. In essence, a direct extension of traditional work-stealing to distributed memory violates the work-first principle underlying work-stealing. Further, thieves spend useless CPU cycles attacking victims that have no work, resulting in system inefficiencies in multi-programmed contexts. Second, it is non-trivial to detect active distributed termination (detect that programs at all nodes are looking for work, hence there is no work). This problem is well-studied and requires careful design for good performance. Unfortunately, in most existing languages/frameworks, application developers are forced to implement their own distributed termination detection. In this paper, we develop a simple set of ideas that allow work-stealing to be efficiently extended to distributed memory. First, we introduce lifeline graphs: low-degree, low-diameter, fully connected directed graphs. Such graphs can be constructed from k-dimensional hypercubes. When a node is unable to find work after w unsuccessful steals, it quiesces after informing the outgoing edges in its lifeline graph. Quiescent nodes do not disturb other nodes. A quiesced node is reactivated when work arrives from a lifeline and itself shares this work with those of its incoming lifelines that are activated. Termination occurs precisely when computation at all nodes has quiesced. In a language such as X10, such passive distributed termination can be detected automatically using the finish construct --- no application code is necessary. Our design is implemented in a few hundred lines of X10. On the binomial tree described in [Olivier:08], the program achieves 87\% efficiency on an Infiniband cluster of 1024 Power7 cores, with a peak throughput of 2.37 GNodes/sec. It achieves 87\% efficiency on a Blue Gene/P with 2048 processors, and a peak throughput of 0.966 GNodes/s. All numbers are relative to single core sequential performance. This implementation has been refactored into a reusable global load balancing framework.
Applications can use this framework to obtain global load balance with minimal code changes. In summary, we claim: (a) the first formulation of UTS that does not involve application level global termination detection, (b) the introduction of lifeline graphs to reduce failed steals, (c) the demonstration of simple lifeline graphs based on k-hypercubes, (d) performance with superior efficiency (or the same efficiency but over a wider range) than published results on UTS. In particular, our framework can deliver the same or better performance as an unrestricted random work-stealing implementation, while reducing the number of attempted steals.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wang:2011:CSP, author = "Zhaoguo Wang and Ran Liu and Yufei Chen and Xi Wu and Haibo Chen and Weihua Zhang and Binyu Zang", title = "{COREMU}: a scalable and portable parallel full-system emulator", journal = j-SIGPLAN, volume = "46", number = "8", pages = "213--222", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941583", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "This paper presents the open-source COREMU, a scalable and portable parallel emulation framework that decouples the complexity of parallelizing full-system emulators from building a mature sequential one. The key observation is that CPU cores and devices in current (and likely future) multiprocessors are loosely-coupled and communicate through well-defined interfaces. Based on this observation, COREMU emulates multiple cores by creating multiple instances of existing sequential emulators, and uses a thin library layer to handle the inter-core and device communication and synchronization, to maintain a consistent view of system resources. COREMU also incorporates lightweight memory transactions, feedback-directed scheduling, lazy code invalidation and adaptive signal control to provide scalable performance. To make COREMU useful in practice, we also provide some preliminary tools and APIs that can help programmers to diagnose performance problems and (concurrency) bugs. A working prototype, which reuses the widely-used QEMU as the sequential emulator, requires only 2500 lines of code (LOCs) of changes to QEMU. It currently supports x64 and ARM platforms, and can emulate up to 255 cores running commodity OSes with practical performance, while QEMU cannot scale above 32 cores. A set of performance evaluations against QEMU indicates that COREMU has negligible uniprocessor emulation overhead, and performs and scales significantly better than QEMU.
We also show how COREMU could be used to diagnose performance problems and concurrency bugs of both OS kernel and parallel applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kogan:2011:WFQ, author = "Alex Kogan and Erez Petrank", title = "Wait-free queues with multiple enqueuers and dequeuers", journal = j-SIGPLAN, volume = "46", number = "8", pages = "223--234", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941585", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "The queue data structure is fundamental and ubiquitous. Lock-free versions of the queue are well known. However, an important open question is whether practical wait-free queues exist. Until now, only versions with limited concurrency were proposed. In this paper we provide a design for a practical wait-free queue. Our construction is based on the highly efficient lock-free queue of Michael and Scott. To achieve wait-freedom, we employ a priority-based helping scheme in which faster threads help the slower peers to complete their pending operations. We have implemented our scheme on multicore machines and present performance measurements comparing our implementation with that of Michael and Scott in several system configurations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tanase:2011:SPC, author = "Gabriel Tanase and Antal Buss and Adam Fidel and Harshvardhan Harshvardhan and Ioannis Papadopoulos and Olga Pearce and Timmie Smith and Nathan Thomas and Xiabing Xu and Nedal Mourad and Jeremy Vu and Mauro Bianco and Nancy M. Amato and Lawrence Rauchwerger", title = "The {STAPL} parallel container framework", journal = j-SIGPLAN, volume = "46", number = "8", pages = "235--246", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941586", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "The Standard Template Adaptive Parallel Library (STAPL) is a parallel programming infrastructure that extends C++ with support for parallelism. It includes a collection of distributed data structures called pContainers that are thread-safe, concurrent objects, i.e., shared objects that provide parallel methods that can be invoked concurrently. In this work, we present the STAPL Parallel Container Framework (PCF), that is designed to facilitate the development of generic parallel containers. We introduce a set of concepts and a methodology for assembling a pContainer from existing sequential or parallel containers, without requiring the programmer to deal with concurrency or data distribution issues. The PCF provides a large number of basic parallel data structures (e.g., pArray, pList, pVector, pMatrix, pGraph, pMap, pSet). The PCF provides a class hierarchy and a composition mechanism that allows users to extend and customize the current container base for improved application expressivity and performance. 
We evaluate STAPL pContainer performance on a CRAY XT4 massively parallel system and show that pContainer methods, generic pAlgorithms, and different applications provide good scalability on more than 16,000 processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kourtis:2011:CEC, author = "Kornilios Kourtis and Vasileios Karakasis and Georgios Goumas and Nectarios Koziris", title = "{CSX}: an extended compression format for {SpMV} on shared memory systems", journal = j-SIGPLAN, volume = "46", number = "8", pages = "247--256", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941587", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "The Sparse Matrix-Vector multiplication (SpMV) kernel scales poorly on shared memory systems with multiple processing units due to the streaming nature of its data access pattern. Previous research has demonstrated that an effective strategy to improve the kernel's performance is to drastically reduce the data volume involved in the computations. Since the storage formats for sparse matrices include metadata describing the structure of non-zero elements within the matrix, we propose a generalized approach to compress metadata by exploiting substructures within the matrix. We call the proposed storage format Compressed Sparse eXtended (CSX). In our implementation we employ runtime code generation to construct specialized SpMV routines for each matrix. Experimental evaluation on two shared memory systems for 15 sparse matrices demonstrates significant performance gains as the number of participating cores increases. Regarding the cost of CSX construction, we propose several strategies which trade performance for preprocessing cost making CSX applicable both to online and offline preprocessing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dotsenko:2011:ATF, author = "Yuri Dotsenko and Sara S. Baghsorkhi and Brandon Lloyd and Naga K. Govindaraju", title = "Auto-tuning of {Fast Fourier Transform} on graphics processors", journal = j-SIGPLAN, volume = "46", number = "8", pages = "257--266", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941589", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "We present an auto-tuning framework for FFTs on graphics processors (GPUs). Due to complex design of the memory and compute subsystems on GPUs, the performance of FFT kernels over the range of possible input parameters can vary widely. We generate several variants for each component of the FFT kernel that, for different cases, are likely to perform well. Our auto-tuner composes variants to generate kernels and selects the best ones. We present heuristics to prune the search space and profile only a small fraction of all possible kernels. We compose optimized kernels to improve the performance of larger FFT computations. 
We implement the system using the NVIDIA CUDA API and compare its performance to the state-of-the-art FFT libraries. On a range of NVIDIA GPUs and input sizes, our auto-tuned FFTs outperform the NVIDIA CUFFT 3.0 library by up to 38x and deliver up to 3x higher performance compared to a manually-tuned FFT.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hong:2011:ACG, author = "Sungpack Hong and Sang Kyun Kim and Tayo Oguntebi and Kunle Olukotun", title = "Accelerating {CUDA} graph algorithms at maximum warp", journal = j-SIGPLAN, volume = "46", number = "8", pages = "267--276", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941590", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Graphs are powerful data representations favored in many computational domains. Modern GPUs have recently shown promising results in accelerating computationally challenging graph problems but their performance suffered heavily when the graph structure is highly irregular, as most real-world graphs tend to be. In this study, we first observe that the poor performance is caused by work imbalance and is an artifact of a discrepancy between the GPU programming model and the underlying GPU architecture. We then propose a novel virtual warp-centric programming method that exposes the traits of underlying GPU architectures to users. Our method significantly improves the performance of applications with heavily imbalanced workloads, and enables trade-offs between workload imbalance and ALU underutilization for fine-tuning the performance. Our evaluation reveals that our method exhibits up to 9x speedup over previous GPU algorithms and 12x over single thread CPU execution on irregular graphs. When properly configured, it also yields up to 30\% improvement over previous GPU algorithms on regular graphs. In addition to performance gains on graph algorithms, our programming method achieves 1.3x to 15.1x speedup on a set of GPU benchmark applications. Our study also confirms that the performance gap between GPUs and other multi-threaded CPU graph implementations is primarily due to the large difference in memory bandwidth.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kim:2011:ASC, author = "Jungwon Kim and Honggyu Kim and Joo Hwan Lee and Jaejin Lee", title = "Achieving a single compute device image in {OpenCL} for multiple {GPUs}", journal = j-SIGPLAN, volume = "46", number = "8", pages = "277--288", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941591", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "In this paper, we propose an OpenCL framework that combines multiple GPUs and treats them as a single compute device.
Providing a single virtual compute device image to the user makes an OpenCL application written for a single GPU portable to the platform that has multiple GPU devices. It also makes the application exploit full computing power of the multiple GPU devices and the total amount of GPU memories available in the platform. Our OpenCL framework automatically distributes at run-time the OpenCL kernel written for a single GPU into multiple CUDA kernels that execute on the multiple GPU devices. It applies a run-time memory access range analysis to the kernel by performing a sampling run and identifies an optimal workload distribution for the kernel. To achieve a single compute device image, the runtime maintains virtual device memory that is allocated in the main memory. The OpenCL runtime treats the memory as if it were the memory of a single GPU device and keeps it consistent to the memories of the multiple GPU devices. Our OpenCL-C-to-C translator generates the sampling code from the OpenCL kernel code and OpenCL-C-to-CUDA-C translator generates the CUDA kernel code for the distributed OpenCL kernel. We show the effectiveness of our OpenCL framework by implementing the OpenCL runtime and two source-to-source translators. We evaluate its performance with a system that contains 8 GPUs using 11 OpenCL benchmark applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Prabhakar:2011:QAS, author = "Ramya Prabhakar and Shekhar Srikantaiah and Rajat Garg and Mahmut Kandemir", title = "{QoS} aware storage cache management in multi-server environments", journal = j-SIGPLAN, volume = "46", number = "8", pages = "289--290", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941593", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "In this paper, we propose a novel two-step approach to the management of the storage caches to provide predictable performance in multi-server storage architectures: (1) An adaptive QoS decomposition and optimization step uses max-flow algorithm to determine the best decomposition of application-level QoS to sub-QoSs such that the application performance is optimized, and (2) A storage cache allocation step uses feedback control theory to allocate shared storage cache space such that the specified QoSs are satisfied throughout the execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Roy:2011:WAU, author = "Amitabha Roy and Steven Hand and Tim Harris", title = "Weak atomicity under the x86 memory consistency model", journal = j-SIGPLAN, volume = "46", number = "8", pages = "291--292", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941594", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "We consider the problem of building a weakly atomic Software Transactional Memory (STM), that provides Single (Global) Lock Atomicity (SLA) while adhering to the x86 memory consistency model (x86-MM).", 
acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jeon:2011:KLG, author = "Donghwan Jeon and Saturnino Garcia and Chris Louie and Sravanthi Kota Venkata and Michael Bedford Taylor", title = "{Kremlin}: like {\tt gprof}, but for parallelization", journal = j-SIGPLAN, volume = "46", number = "8", pages = "293--294", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941595", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "This paper overviews Kremlin, a software profiling tool designed to assist the parallelization of serial programs. Kremlin accepts a serial source code, profiles it, and provides a list of regions that should be considered in parallelization. Unlike a typical profiler, Kremlin profiles not only work but also parallelism, which is accomplished via a novel technique called hierarchical critical path analysis. Our evaluation demonstrates that Kremlin is highly effective, resulting in a parallelized program whose performance sometimes outperforms, and is mostly comparable to, manual parallelization. At the same time, Kremlin would require that the user parallelize significantly fewer regions of the program. Finally, a user study suggests Kremlin is effective in improving the productivity of programmers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Strzodka:2011:TSM, author = "Robert Strzodka and Mohammed Shaheen and Dawid Pajak", title = "Time skewing made simple", journal = j-SIGPLAN, volume = "46", number = "8", pages = "295--296", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941596", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Time skewing and loop tiling has been known for a long time to be a highly beneficial acceleration technique for nested loops especially on bandwidth hungry multi-core processors, but it is little used in practice because efficient implementations utilize complicated code and simple or abstract ones show much smaller gains over naive nested loops. 
We break this dilemma with an essential time skewing scheme that is both compact and fast.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Grosset:2011:EGC, author = "Andre Vincent Pascal Grosset and Peihong Zhu and Shusen Liu and Suresh Venkatasubramanian and Mary Hall", title = "Evaluating graph coloring on {GPUs}", journal = j-SIGPLAN, volume = "46", number = "8", pages = "297--298", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941597", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "This paper evaluates features of graph coloring algorithms implemented on graphics processing units (GPUs), comparing coloring heuristics and thread decompositions. As compared to prior work on graph coloring for other parallel architectures, we find that the large number of cores and relatively high global memory bandwidth of a GPU lead to different strategies for the parallel implementation. Specifically, we find that a simple uniform block partitioning is very effective on GPUs, and our parallel coloring heuristics lead to the same or fewer colors than prior approaches for distributed-memory cluster architectures. Our algorithm resolves many coloring conflicts across partitioned blocks on the GPU by iterating through the coloring process, before returning to the CPU to resolve remaining conflicts. With this approach we get as few colors as (if not fewer than) the best sequential graph coloring algorithm, and our performance is close to that of the fastest sequential graph coloring algorithms, which have poor color quality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ding:2011:TEP, author = "Chen Ding", title = "Two examples of parallel programming without concurrency constructs {(PP-CC)}", journal = j-SIGPLAN, volume = "46", number = "8", pages = "299--300", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941598", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Stellwag:2011:WFN, author = "Philippe Stellwag and Fabian Scheler and Jakob Krainz and Wolfgang Schr{\"o}der-Preikschat", title = "A wait-free {NCAS} library for parallel applications with timing constraints", journal = j-SIGPLAN, volume = "46", number = "8", pages = "301--302", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941599", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "We introduce the major ideas of our wait-free, linearizable, and disjoint-access-parallel NCAS library, called rtNCAS. It focuses on the construction of wait-free data structure operations (DSO) in real-time circumstances.
rtNCAS is able to conditionally swap multiple independent words (NCAS) in an atomic manner. It allows us, furthermore, to implement arbitrary DSO by means of their sequential specification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Davies:2011:ABR, author = "Teresa Davies and Zizhong Chen and Christer Karlsson and Hui Liu", title = "Algorithm-based recovery for {HPL}", journal = j-SIGPLAN, volume = "46", number = "8", pages = "303--304", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941600", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "When more processors are used for a calculation, the probability that one will fail during the calculation increases. Fault tolerance is a technique for allowing a calculation to survive a failure, and includes recovering lost data. A common method of recovery is diskless checkpointing. However, it has high overhead when a large amount of data is involved, as is the case with matrix operations. A checksum-based method allows fault tolerance of matrix operations with lower overhead. This technique is applicable to the LU decomposition in the benchmark HPL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Willcock:2011:APP, author = "Jeremiah James Willcock and Torsten Hoefler and Nicholas Gerard Edmonds and Andrew Lumsdaine", title = "{Active Pebbles}: a programming model for highly parallel fine-grained data-driven computations", journal = j-SIGPLAN, volume = "46", number = "8", pages = "305--306", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941601", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "A variety of programming models exist to support large-scale, distributed memory, parallel computation. These programming models have historically targeted coarse-grained applications with natural locality such as those found in a variety of scientific simulations of the physical world. Fine-grained, irregular, and unstructured applications such as those found in biology, social network analysis, and graph theory are less well supported. 
We propose Active Pebbles, a programming model which allows these applications to be expressed naturally; an accompanying execution model ensures performance and scalability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fischer:2011:SMC, author = "Topher Fischer and Eric Mercer and Neha Rungta", title = "Symbolically modeling concurrent {MCAPI} executions", journal = j-SIGPLAN, volume = "46", number = "8", pages = "307--308", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941602", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "Improper use of Inter-Process Communication (IPC) within concurrent systems often creates data races which can lead to bugs that are challenging to discover. Techniques that use Satisfiability Modulo Theories (SMT) problems to symbolically model possible executions of concurrent software have recently been proposed for use in the formal verification of software. In this work we describe a new technique for modeling executions of concurrent software that use a message passing API called MCAPI. Our technique uses an execution trace to create an SMT problem that symbolically models all possible concurrent executions and follows the same sequence of conditional branch outcomes as the provided execution trace. We check if there exists a satisfying assignment to the SMT problem with respect to specific safety properties. If such an assignment exists, it provides the conditions that lead to the violation of the property. We show how our method models behaviors of MCAPI applications that are ignored in previously published techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Siegel:2011:AFV, author = "Stephen F. Siegel and Timothy K. Zirkel", title = "Automatic formal verification of {MPI}-based parallel programs", journal = j-SIGPLAN, volume = "46", number = "8", pages = "309--310", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941603", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "The Toolkit for Accurate Scientific Software (TASS) is a suite of tools for the formal verification of MPI-based parallel programs used in computational science. TASS can verify various safety properties as well as compare two programs for functional equivalence. The TASS front end takes an integer $ n \geq 1 $ and a C/MPI program, and constructs an abstract model of the program with $n$ processes. Procedures, structs, (multi-dimensional) arrays, heap-allocated data, pointers, and pointer arithmetic are all representable in a TASS model. The model is then explored using symbolic execution and explicit state space enumeration. A number of techniques are used to reduce the time and memory consumed. 
A variety of realistic MPI programs have been verified with TASS, including Jacobi iteration and manager-worker type programs, and some subtle defects have been discovered. TASS is written in Java and is available from \path=http://vsl.cis.udel.edu/tass= under the Gnu Public License.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Donaldson:2011:STA, author = "Alastair F. Donaldson and Daniel Kroening and Philipp Ruemmer", title = "{SCRATCH}: a tool for automatic analysis of {DMA} races", journal = j-SIGPLAN, volume = "46", number = "8", pages = "311--312", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941604", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "We present the SCRATCH tool, which uses bounded model checking and k-induction to automatically analyse software for multicore processors such as the Cell BE, in order to detect DMA races.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Botincan:2011:ASP, author = "Matko Botincan and Mike Dodds and Alastair F. Donaldson and Matthew J. Parkinson", title = "Automatic safety proofs for asynchronous memory operations", journal = j-SIGPLAN, volume = "46", number = "8", pages = "313--314", month = aug, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2038037.1941605", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 14:04:45 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '11 Conference proceedings.", abstract = "We present a work-in-progress proof system and tool, based on separation logic, for analysing memory safety of multicore programs that use asynchronous memory operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Filinski:2011:TCT, author = "Andrzej Filinski", title = "Towards a comprehensive theory of monadic effects", journal = j-SIGPLAN, volume = "46", number = "9", pages = "1--1", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034775", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gibbons:2011:JDI, author = "Jeremy Gibbons and Ralf Hinze", title = "Just do it: simple monadic equational reasoning", journal = j-SIGPLAN, volume = "46", number = "9", pages = "2--14", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034777", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = 
"https://dl.acm.org/loi/sigplan", } @Article{Swamy:2011:LMP, author = "Nikhil Swamy and Nataliya Guts and Daan Leijen and Michael Hicks", title = "Lightweight monadic programming in {ML}", journal = j-SIGPLAN, volume = "46", number = "9", pages = "15--27", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034778", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mitchell:2011:FPT, author = "Emily G. Mitchell", title = "Functional programming through deep time: modeling the first complex ecosystems on {Earth}", journal = j-SIGPLAN, volume = "46", number = "9", pages = "28--31", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034779", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Schrijvers:2011:MZV, author = "Tom Schrijvers and Bruno C. d. S. Oliveira", title = "Monads, zippers and views: virtualizing the monad stack", journal = j-SIGPLAN, volume = "46", number = "9", pages = "32--44", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034781", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Krishnaswami:2011:SMG, author = "Neelakantan R. Krishnaswami and Nick Benton", title = "A semantic model for graphical user interfaces", journal = j-SIGPLAN, volume = "46", number = "9", pages = "45--57", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034782", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shivers:2011:MRT, author = "Olin Shivers and Aaron J. 
Turon", title = "Modular rollback through control logging: a pair of twin functional pearls", journal = j-SIGPLAN, volume = "46", number = "9", pages = "58--68", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034783", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Vardoulakis:2011:PFA, author = "Dimitrios Vardoulakis and Olin Shivers", title = "Pushdown flow analysis of first-class control", journal = j-SIGPLAN, volume = "46", number = "9", pages = "69--80", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034785", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Materzok:2011:SDC, author = "Marek Materzok and Dariusz Biernacki", title = "Subtyping delimited continuations", journal = j-SIGPLAN, volume = "46", number = "9", pages = "81--93", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034786", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Castagna:2011:STF, author = "Giuseppe Castagna and Zhiwu Xu", title = "Set-theoretic foundation of parametric polymorphism and subtyping", journal = j-SIGPLAN, volume = "46", number = "9", pages = "94--106", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034788", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gesbert:2011:PPS, author = "Nils Gesbert and Pierre Genev{\`e}s and Nabil Laya{\"\i}da", title = "Parametric polymorphism and semantic subtyping: the logical connection", journal = j-SIGPLAN, volume = "46", number = "9", pages = "107--116", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034789", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Morihata:2011:BTI, author = "Akimasa Morihata and Kiminori Matsuzaki", title = "Balanced trees inhabiting functional parallel 
programming", journal = j-SIGPLAN, volume = "46", number = "9", pages = "117--128", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034791", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2011:ISA, author = "Yan Chen and Joshua Dunfield and Matthew A. Hammer and Umut A. Acar", title = "Implicit self-adjusting computation for purely functional programs", journal = j-SIGPLAN, volume = "46", number = "9", pages = "129--141", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034792", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Takeyama:2011:PAC, author = "Makoto Takeyama", title = "Programming assurance cases in {Agda}", journal = j-SIGPLAN, volume = "46", number = "9", pages = "142--142", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034794", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Devriese:2011:BST, author = "Dominique Devriese and Frank Piessens", title = "On the bright side of type classes: instance arguments in {Agda}", journal = j-SIGPLAN, volume = "46", number = "9", pages = "143--155", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034796", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Magalhaes:2011:FMM, author = "Jos{\'e} Pedro Magalh{\~a}es and W. 
Bas de Haas", title = "Functional modelling of musical harmony: an experience report", journal = j-SIGPLAN, volume = "46", number = "9", pages = "156--162", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034797", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gonthier:2011:HMA, author = "Georges Gonthier and Beta Ziliani and Aleksandar Nanevski and Derek Dreyer", title = "How to make ad hoc proof automation less ad hoc", journal = j-SIGPLAN, volume = "46", number = "9", pages = "163--175", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034798", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Disney:2011:THO, author = "Tim Disney and Cormac Flanagan and Jay McCarthy", title = "Temporal higher-order contracts", journal = j-SIGPLAN, volume = "46", number = "9", pages = "176--188", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034800", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Might:2011:PDF, author = "Matthew Might and David Darais and Daniel Spiewak", title = "Parsing with derivatives: a functional pearl", journal = j-SIGPLAN, volume = "46", number = "9", pages = "189--195", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034801", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ueno:2011:ENM, author = "Katsuhiro Ueno and Atsushi Ohori and Toshiaki Otomo", title = "An efficient non-moving garbage collector for functional languages", journal = j-SIGPLAN, volume = "46", number = "9", pages = "196--208", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034802", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gill:2011:DEF, author = "Andy Gill and Andrew Farmer", title = "Deriving an efficient {FPGA} 
implementation of a low density parity check forward error corrector", journal = j-SIGPLAN, volume = "46", number = "9", pages = "209--220", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034804", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ghica:2011:GSIb, author = "Dan R. Ghica and Alex Smith and Satnam Singh", title = "Geometry of synthesis {IV}: compiling affine recursion into static hardware", journal = j-SIGPLAN, volume = "46", number = "9", pages = "221--233", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034805", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ahn:2011:HMS, author = "Ki Yung Ahn and Tim Sheard", title = "A hierarchy of mendler style recursion combinators: taming inductive datatypes with negative occurrences", journal = j-SIGPLAN, volume = "46", number = "9", pages = "234--246", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034807", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jay:2011:TSI, author = "Barry Jay and Jens Palsberg", title = "Typed self-interpretation by pattern matching", journal = j-SIGPLAN, volume = "46", number = "9", pages = "247--258", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034808", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chyzak:2011:UCP, author = "Fr{\'e}d{\'e}ric Chyzak and Alexis Darrasse", title = "Using {{\tt camlp4}} for presenting dynamic mathematics on the {Web}: {DynaMoW}, an {OCaml} language extension for the run-time generation of mathematical contents and their presentation on the {Web}", journal = j-SIGPLAN, volume = "46", number = "9", pages = "259--265", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034809", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = 
"https://dl.acm.org/loi/sigplan", } @Article{Swamy:2011:SDP, author = "Nikhil Swamy and Juan Chen and C{\'e}dric Fournet and Pierre-Yves Strub and Karthikeyan Bhargavan and Jean Yang", title = "Secure distributed programming with value-dependent types", journal = j-SIGPLAN, volume = "46", number = "9", pages = "266--278", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034811", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Foster:2011:FNP, author = "Nate Foster and Rob Harrison and Michael J. Freedman and Christopher Monsanto and Jennifer Rexford and Alec Story and David Walker", title = "{Frenetic}: a network programming language", journal = j-SIGPLAN, volume = "46", number = "9", pages = "279--291", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034812", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fisher:2011:FLT, author = "Kathleen Fisher and Nate Foster and David Walker and Kenny Q. Zhu", title = "{Forest}: a language and toolkit for programming with filestores", journal = j-SIGPLAN, volume = "46", number = "9", pages = "292--306", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034814", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ohori:2011:MSM, author = "Atsushi Ohori and Katsuhiro Ueno", title = "Making {Standard ML} a practical database programming language", journal = j-SIGPLAN, volume = "46", number = "9", pages = "307--319", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034815", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pouillard:2011:NP, author = "Nicolas Pouillard", title = "Nameless, painless", journal = j-SIGPLAN, volume = "46", number = "9", pages = "320--332", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034817", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal 
= "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Weirich:2011:BU, author = "Stephanie Weirich and Brent A. Yorgey and Tim Sheard", title = "Binders unbound", journal = j-SIGPLAN, volume = "46", number = "9", pages = "333--345", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034818", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Popescu:2011:RPS, author = "Andrei Popescu and Elsa L. Gunter", title = "Recursion principles for syntax with bindings and substitution", journal = j-SIGPLAN, volume = "46", number = "9", pages = "346--358", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034819", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hinze:2011:PUF, author = "Ralf Hinze and Daniel W. H. James", title = "Proving the unique fixed-point principle correct: an adventure with category theory", journal = j-SIGPLAN, volume = "46", number = "9", pages = "359--371", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034821", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gaboardi:2011:LPS, author = "Marco Gaboardi and Luca Paolini and Mauro Piccolo", title = "Linearity and {PCF}: a semantic insight!", journal = j-SIGPLAN, volume = "46", number = "9", pages = "372--384", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034822", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mu:2011:GDT, author = "Shin-Cheng Mu and Akimasa Morihata", title = "Generalising and dualising the third list-homomorphism theorem: functional pearl", journal = j-SIGPLAN, volume = "46", number = "9", pages = "385--391", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034824", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } 
@Article{Wang:2011:IUE, author = "Meng Wang and Jeremy Gibbons and Nicolas Wu", title = "Incremental updates for efficient bidirectional transformations", journal = j-SIGPLAN, volume = "46", number = "9", pages = "392--403", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034825", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gotsman:2011:MVP, author = "Alexey Gotsman and Hongseok Yang", title = "Modular verification of preemptive {OS} kernels", journal = j-SIGPLAN, volume = "46", number = "9", pages = "404--417", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034827", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chargueraud:2011:CFV, author = "Arthur Chargu{\'e}raud", title = "Characteristic formulae for the verification of imperative programs", journal = j-SIGPLAN, volume = "46", number = "9", pages = "418--430", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034828", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ahmed:2011:EPC, author = "Amal Ahmed and Matthias Blume", title = "An equivalence-preserving {CPS} translation via multi-language semantics", journal = j-SIGPLAN, volume = "46", number = "9", pages = "431--444", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034830", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Thamsborg:2011:KLR, author = "Jacob Thamsborg and Lars Birkedal", title = "A {Kripke} logical relation for effect-based program transformations", journal = j-SIGPLAN, volume = "46", number = "9", pages = "445--456", month = sep, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2034574.2034831", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 22 08:31:30 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ICFP '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sutherland:2011:SP, author = "Ivan Sutherland", title = "The 
sequential prison", journal = j-SIGPLAN, volume = "46", number = "10", pages = "1--2", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048068", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liu:2011:SPD, author = "Tongping Liu and Emery D. Berger", title = "{SHERIFF}: precise detection and automatic mitigation of false sharing", journal = j-SIGPLAN, volume = "46", number = "10", pages = "3--18", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048070", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Weeratunge:2011:APA, author = "Dasarath Weeratunge and Xiangyu Zhang and Suresh Jaganathan", title = "Accentuating the positive: atomicity inference and enforcement using correct executions", journal = j-SIGPLAN, volume = "46", number = "10", pages = "19--34", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048071", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2011:SST, author = "Du Li and Witawas Srisa-an and Matthew B. 
Dwyer", title = "{SOS}: saving time in dynamic race detection with stationary analysis", journal = j-SIGPLAN, volume = "46", number = "10", pages = "35--50", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048072", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shacham:2011:TAC, author = "Ohad Shacham and Nathan Bronson and Alex Aiken and Mooly Sagiv and Martin Vechev and Eran Yahav", title = "Testing atomicity of composed concurrent operations", journal = j-SIGPLAN, volume = "46", number = "10", pages = "51--64", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048073", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yessenov:2011:DDS, author = "Kuat Yessenov and Zhilei Xu and Armando Solar-Lezama", title = "Data-driven synthesis for object-oriented frameworks", journal = j-SIGPLAN, volume = "46", number = "10", pages = "65--82", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048075", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pu:2011:SFO, author = "Yewen Pu and Rastislav Bodik and Saurabh Srivastava", title = "Synthesis of first-order dynamic programming algorithms", journal = j-SIGPLAN, volume = "46", number = "10", pages = "83--98", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048076", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Doherty:2011:KAM, author = "Jesse Doherty and Laurie Hendren and Soroush Radpour", title = "Kind analysis for {MATLAB}", journal = j-SIGPLAN, volume = "46", number = "10", pages = "99--118", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048077", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Feldthaus:2011:TSR, author = "Asger Feldthaus and Todd Millstein and 
Anders M{\o}ller and Max Sch{\"a}fer and Frank Tip", title = "Tool-supported refactoring for {JavaScript}", journal = j-SIGPLAN, volume = "46", number = "10", pages = "119--138", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048078", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kats:2011:ILD, author = "Lennart C. L. Kats and Rob Vermaas and Eelco Visser", title = "Integrated language definition testing: enabling test-driven language development", journal = j-SIGPLAN, volume = "46", number = "10", pages = "139--154", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048080", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jovic:2011:CMI, author = "Milan Jovic and Andrea Adamoli and Matthias Hauswirth", title = "Catch me if you can: performance bug detection in the wild", journal = j-SIGPLAN, volume = "46", number = "10", pages = "155--170", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048081", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Joshi:2011:PPT, author = "Pallavi Joshi and Haryadi S. 
Gunawi and Koushik Sen", title = "{PREFAIL}: a programmable tool for multiple-failure injection", journal = j-SIGPLAN, volume = "46", number = "10", pages = "171--188", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048082", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Thummalapenta:2011:SMS, author = "Suresh Thummalapenta and Tao Xie and Nikolai Tillmann and Jonathan de Halleux and Zhendong Su", title = "Synthesizing method sequences for high-coverage testing", journal = j-SIGPLAN, volume = "46", number = "10", pages = "189--206", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048083", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tripp:2011:HED, author = "Omer Tripp and Greta Yorsh and John Field and Mooly Sagiv", title = "{HAWKEYE}: effective discovery of dataflow impediments to parallelization", journal = j-SIGPLAN, volume = "46", number = "10", pages = "207--224", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048085", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Golan-Gueta:2011:AFG, author = "Guy Golan-Gueta and Nathan Bronson and Alex Aiken and G. 
Ramalingam and Mooly Sagiv and Eran Yahav", title = "Automatic fine-grain locking using shape properties", journal = j-SIGPLAN, volume = "46", number = "10", pages = "225--242", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048086", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ke:2011:SPP, author = "Chuanle Ke and Lei Liu and Chao Zhang and Tongxin Bai and Bryan Jacobs and Chen Ding", title = "Safe parallel programming using dynamic dependence hints", journal = j-SIGPLAN, volume = "46", number = "10", pages = "243--258", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048087", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Raman:2011:SSP, author = "Arun Raman and Greta Yorsh and Martin Vechev and Eran Yahav", title = "{Sprint}: speculative prefetching of remote data", journal = j-SIGPLAN, volume = "46", number = "10", pages = "259--274", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048088", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Aftandilian:2011:AA, author = "Edward E. Aftandilian and Samuel Z. Guyer and Martin Vechev and Eran Yahav", title = "Asynchronous assertions", journal = j-SIGPLAN, volume = "46", number = "10", pages = "275--288", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048090", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hoffman:2011:RPS, author = "Kevin J. 
Hoffman and Harrison Metzger and Patrick Eugster", title = "{Ribbons}: a partially shared memory programming model", journal = j-SIGPLAN, volume = "46", number = "10", pages = "289--306", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048091", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yang:2011:WNM, author = "Xi Yang and Stephen M. Blackburn and Daniel Frampton and Jennifer B. Sartor and Kathryn S. McKinley", title = "Why nothing matters: the impact of zeroing", journal = j-SIGPLAN, volume = "46", number = "10", pages = "307--324", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048092", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Darulova:2011:TNC, author = "Eva Darulova and Viktor Kuncak", title = "Trustworthy numerical computation in {Scala}", journal = j-SIGPLAN, volume = "46", number = "10", pages = "325--344", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048094", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2011:JEC, author = "Siliang Li and Gang Tan", title = "{JET}: exception checking in the {Java Native Interface}", journal = j-SIGPLAN, volume = "46", number = "10", pages = "345--358", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048095", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{David:2011:ISM, author = "Cristina David and Wei-Ngan Chin", title = "Immutable specifications for more concise and precise verification", journal = j-SIGPLAN, volume = "46", number = "10", pages = "359--374", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048096", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shali:2011:HPE, author = "Amin Shali and William R. 
Cook", title = "Hybrid partial evaluation", journal = j-SIGPLAN, volume = "46", number = "10", pages = "375--390", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048098", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Erdweg:2011:SLB, author = "Sebastian Erdweg and Tillmann Rendel and Christian K{\"a}stner and Klaus Ostermann", title = "{SugarJ}: library-based syntactic language extensibility", journal = j-SIGPLAN, volume = "46", number = "10", pages = "391--406", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048099", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Demetrescu:2011:RIP, author = "Camil Demetrescu and Irene Finocchi and Andrea Ribichini", title = "Reactive imperative programming with dataflow constraints", journal = j-SIGPLAN, volume = "46", number = "10", pages = "407--426", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048100", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Burckhardt:2011:TPO, author = "Sebastian Burckhardt and Daan Leijen and Caitlin Sadowski and Jaeheon Yi and Thomas Ball", title = "Two for the price of one: a model for parallel and incremental computation", journal = j-SIGPLAN, volume = "46", number = "10", pages = "427--444", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048101", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tian:2011:STT, author = "Kai Tian and Eddy Zhang and Xipeng Shen", title = "A step towards transparent integration of input-consciousness into dynamic program optimizations", journal = j-SIGPLAN, volume = "46", number = "10", pages = "445--462", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048103", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", 
} @Article{Jo:2011:ELR, author = "Youngjoon Jo and Milind Kulkarni", title = "Enhancing locality for recursive traversals of recursive structures", journal = j-SIGPLAN, volume = "46", number = "10", pages = "463--482", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048104", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Adams:2011:FST, author = "Michael D. Adams and Andrew W. Keep and Jan Midtgaard and Matthew Might and Arun Chauhan and R. Kent Dybvig", title = "Flow-sensitive type recovery in linear-log time", journal = j-SIGPLAN, volume = "46", number = "10", pages = "483--498", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048105", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Acar:2011:OSC, author = "Umut A. Acar and Arthur Chargu{\'e}raud and Mike Rainey", title = "Oracle scheduling: controlling granularity in implicitly parallel languages", journal = j-SIGPLAN, volume = "46", number = "10", pages = "499--518", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048106", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jeon:2011:KPS, author = "Donghwan Jeon and Saturnino Garcia and Chris Louie and Michael Bedford Taylor", title = "{Kismet}: parallel speedup estimates for serial programs", journal = j-SIGPLAN, volume = "46", number = "10", pages = "519--536", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048108", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cledat:2011:ESS, author = "Romain E. 
Cledat and Tushar Kumar and Santosh Pande", title = "Efficiently speeding up sequential computation through the n-way programming model", journal = j-SIGPLAN, volume = "46", number = "10", pages = "537--554", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048109", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pyla:2011:ECG, author = "Hari K. Pyla and Calvin Ribbens and Srinidhi Varadarajan", title = "Exploiting coarse-grain speculative parallelism", journal = j-SIGPLAN, volume = "46", number = "10", pages = "555--574", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048110", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Turon:2011:SJP, author = "Aaron J. Turon and Claudio V. Russo", title = "Scalable join patterns", journal = j-SIGPLAN, volume = "46", number = "10", pages = "575--594", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048111", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Delaware:2011:PLT, author = "Benjamin Delaware and William Cook and Don Batory", title = "Product lines of theorems", journal = j-SIGPLAN, volume = "46", number = "10", pages = "595--608", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048113", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ina:2011:GTG, author = "Lintaro Ina and Atsushi Igarashi", title = "Gradual typing for generics", journal = j-SIGPLAN, volume = "46", number = "10", pages = "609--624", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048114", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tov:2011:TST, author = "Jesse A. 
Tov and Riccardo Pucella", title = "A theory of substructural types and control", journal = j-SIGPLAN, volume = "46", number = "10", pages = "625--642", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048115", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Buse:2011:BBU, author = "Raymond P. L. Buse and Caitlin Sadowski and Westley Weimer", title = "Benefits and barriers of user evaluation in software engineering research", journal = j-SIGPLAN, volume = "46", number = "10", pages = "643--656", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048117", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sewe:2011:CCS, author = "Andreas Sewe and Mira Mezini and Aibek Sarimbekov and Walter Binder", title = "Da capo con {Scala}: design and analysis of a {Scala} benchmark suite for the {Java Virtual Machine}", journal = j-SIGPLAN, volume = "46", number = "10", pages = "657--676", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048118", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Richards:2011:ACJ, author = "Gregor Richards and Andreas Gal and Brendan Eich and Jan Vitek", title = "Automated construction of {JavaScript} benchmarks", journal = j-SIGPLAN, volume = "46", number = "10", pages = "677--694", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048119", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hemel:2011:DPM, author = "Zef Hemel and Eelco Visser", title = "Declaratively programming the {Mobile Web} with {Mobl}", journal = j-SIGPLAN, volume = "46", number = "10", pages = "695--712", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048121", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sunshine:2011:FCS, 
author = "Joshua Sunshine and Karl Naden and Sven Stork and Jonathan Aldrich and {\'E}ric Tanter", title = "First-class state change in {Plaid}", journal = j-SIGPLAN, volume = "46", number = "10", pages = "713--732", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048122", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lorenz:2011:CLL, author = "David H. Lorenz and Boaz Rosenan", title = "{Cedalion}: a language for language oriented programming", journal = j-SIGPLAN, volume = "46", number = "10", pages = "733--752", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048123", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hammer:2011:SAS, author = "Matthew A. Hammer and Georg Neis and Yan Chen and Umut A. Acar", title = "Self-adjusting stack machines", journal = j-SIGPLAN, volume = "46", number = "10", pages = "753--772", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048124", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kulkarni:2011:JCP, author = "Prasad A. 
Kulkarni", title = "{JIT} compilation policy for modern machines", journal = j-SIGPLAN, volume = "46", number = "10", pages = "773--788", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048126", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wu:2011:RTS, author = "Peng Wu and Hiroshige Hayashizaki and Hiroshi Inoue and Toshio Nakatani", title = "Reducing trace selection footprint for large-scale {Java} applications without performance loss", journal = j-SIGPLAN, volume = "46", number = "10", pages = "789--804", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048127", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kastner:2011:VAP, author = "Christian K{\"a}stner and Paolo G. Giarrusso and Tillmann Rendel and Sebastian Erdweg and Klaus Ostermann and Thorsten Berger", title = "Variability-aware parsing in the presence of lexical macros and conditional compilation", journal = j-SIGPLAN, volume = "46", number = "10", pages = "805--824", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048128", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wurthinger:2011:SAR, author = "Thomas W{\"u}rthinger and Danilo Ansaloni and Walter Binder and Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck", title = "Safe and atomic run-time code evolution for {Java} and its application to dynamic {AOP}", journal = j-SIGPLAN, volume = "46", number = "10", pages = "825--844", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048129", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pinto:2011:SAC, author = "Pedro da Rocha Pinto and Thomas Dinsdale-Young and Mike Dodds and Philippa Gardner and Mark Wheelhouse", title = "A simple abstraction for complex concurrent indexes", journal = j-SIGPLAN, volume = "46", number = "10", pages = "845--864", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048131", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; 
https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Anderson:2011:CNP, author = "Zachary Anderson and David Gay", title = "Composable, nestable, pessimistic atomic statements", journal = j-SIGPLAN, volume = "46", number = "10", pages = "865--884", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048132", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lublinerman:2011:DI, author = "Roberto Lublinerman and Jisheng Zhao and Zoran Budimli{\'c} and Swarat Chaudhuri and Vivek Sarkar", title = "Delegated isolation", journal = j-SIGPLAN, volume = "46", number = "10", pages = "885--902", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048133", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Harris:2011:ACA, author = "Tim Harris and Martin Abadi and Rebecca Isaacs and Ross McIlroy", title = "{AC}: composable asynchronous {IO} for native languages", journal = j-SIGPLAN, volume = "46", number = "10", pages = "903--920", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048134", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Austin:2011:VVL, author = "Thomas H. Austin and Tim Disney and Cormac Flanagan", title = "Virtual values for language extension", journal = j-SIGPLAN, volume = "46", number = "10", pages = "921--938", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048136", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Palmer:2011:BJM, author = "Zachary Palmer and Scott F. 
Smith", title = "Backstage {Java}: making a difference in metaprogramming", journal = j-SIGPLAN, volume = "46", number = "10", pages = "939--958", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048137", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Verwaest:2011:FOL, author = "Toon Verwaest and Camillo Bruni and Mircea Lungu and Oscar Nierstrasz", title = "Flexible object layouts: enabling lightweight language extensions by intercepting slot access", journal = j-SIGPLAN, volume = "46", number = "10", pages = "959--972", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048138", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Allen:2011:TCM, author = "Eric Allen and Justin Hilburn and Scott Kilpatrick and Victor Luchangco and Sukyoung Ryu and David Chase and Guy Steele", title = "Type checking modular multiple dispatch with parametric polymorphism and multiple inheritance", journal = j-SIGPLAN, volume = "46", number = "10", pages = "973--992", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048140", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Im:2011:STS, author = "Hyeonseung Im and Keiko Nakata and Jacques Garrigue and Sungwoo Park", title = "A syntactic type system for recursive modules", journal = j-SIGPLAN, volume = "46", number = "10", pages = "993--1012", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048141", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Summers:2011:FBC, author = "Alexander J. 
Summers and Peter Mueller", title = "Freedom before commitment: a lightweight type system for object initialisation", journal = j-SIGPLAN, volume = "46", number = "10", pages = "1013--1032", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048142", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Madhavan:2011:NDV, author = "Ravichandhran Madhavan and Raghavan Komondoor", title = "Null dereference verification via over-approximated weakest pre-conditions analysis", journal = j-SIGPLAN, volume = "46", number = "10", pages = "1033--1052", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048144", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sridharan:2011:FTA, author = "Manu Sridharan and Shay Artzi and Marco Pistoia and Salvatore Guarnieri and Omer Tripp and Ryan Berg", title = "{F4F}: taint analysis of framework-based {Web} applications", journal = j-SIGPLAN, volume = "46", number = "10", pages = "1053--1068", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048145", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Son:2011:RFM, author = "Sooel Son and Kathryn S. McKinley and Vitaly Shmatikov", title = "{RoleCast}: finding missing security checks when you do not know what checks are", journal = j-SIGPLAN, volume = "46", number = "10", pages = "1069--1084", month = oct, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076021.2048146", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:53 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '11 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Veldema:2011:IDP, author = "Ronald Veldema and Michael Philippsen", title = "Iterative data-parallel mark\&sweep on a {GPU}", journal = j-SIGPLAN, volume = "46", number = "11", pages = "1--10", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993480", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "Automatic memory management makes programming easier.
This is also true for general purpose GPU computing where currently no garbage collectors exist. In this paper we present a parallel mark-and-sweep collector to collect GPU memory on the GPU and tune its performance. Performance is increased by: (1) data-parallel marking and sweeping of regions of memory, (2) marking all elements of large arrays in parallel, (3) trading recursion over parallelism to match deeply linked data structures. (1) is achieved by coarsely processing all potential objects in a region of memory in parallel. When during (1) a large array is detected, it is put aside and a parallel-for is later issued on the GPU to mark its elements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Majo:2011:MMN, author = "Zoltan Majo and Thomas R. Gross", title = "Memory management in {NUMA} multicore systems: trapped between cache contention and interconnect overhead", journal = j-SIGPLAN, volume = "46", number = "11", pages = "11--20", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993481", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "Multiprocessors based on processors with multiple cores usually include a non-uniform memory architecture (NUMA); even current 2-processor systems with 8 cores exhibit non-uniform memory access times. As the cores of a processor share a common cache, the issues of memory management and process mapping must be revisited. We find that optimizing only for data locality can counteract the benefits of cache contention avoidance and vice versa. Therefore, system software must take both data locality and cache contention into account to achieve good performance, and memory management cannot be decoupled from process scheduling. We present a detailed analysis of a commercially available NUMA-multicore architecture, the Intel Nehalem.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Marlow:2011:MGC, author = "Simon Marlow and Simon Peyton Jones", title = "Multicore garbage collection with local heaps", journal = j-SIGPLAN, volume = "46", number = "11", pages = "21--32", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993482", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "In a parallel, shared-memory, language with a garbage collected heap, it is desirable for each processor to perform minor garbage collections independently. Although obvious, it is difficult to make this idea pay off in practice, especially in languages where mutation is common. We present several techniques that substantially improve the state of the art. We describe these techniques in the context of a full-scale implementation of Haskell, and demonstrate that our local-heap collector substantially improves scaling, peak performance, and robustness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Garner:2011:CEO, author = "Robin J. 
Garner and Stephen M. Blackburn and Daniel Frampton", title = "A comprehensive evaluation of object scanning techniques", journal = j-SIGPLAN, volume = "46", number = "11", pages = "33--42", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993484", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "At the heart of all garbage collectors lies the process of identifying and processing reference fields within an object. Despite its key role, and evidence of many different implementation approaches, to our knowledge no comprehensive quantitative study of this design space exists. The lack of such a study means that implementers must rely on `conventional wisdom', hearsay, and their own costly analysis. Starting with mechanisms described in the literature and a variety of permutations of these, we explore the impact of a number of dimensions including: (a) the choice of data structure, (b) levels of indirection from object to metadata, and (c) specialization of scanning code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gu:2011:TPL, author = "Xiaoming Gu and Chen Ding", title = "On the theory and potential of {LRU--MRU} collaborative cache management", journal = j-SIGPLAN, volume = "46", number = "11", pages = "43--54", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993485", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "The goal of cache management is to maximize data reuse. Collaborative caching provides an interface for software to communicate access information to hardware. In theory, it can obtain optimal cache performance. In this paper, we study a collaborative caching system that allows a program to choose different caching methods for its data. As an interface, it may be used in arbitrary ways, sometimes optimal but probably suboptimal most times and even counter productive. We develop a theoretical foundation for collaborative caches to show the inclusion principle and the existence of a distance metric we call LRU-MRU stack distance. 
The new stack distance is important for program analysis and transformation to target a hierarchical collaborative cache system rather than a single cache configuration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Afek:2011:CIA, author = "Yehuda Afek and Dave Dice and Adam Morrison", title = "Cache index-aware memory allocation", journal = j-SIGPLAN, volume = "46", number = "11", pages = "55--64", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993486", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "Poor placement of data blocks in memory may negatively impact application performance because of an increase in the cache conflict miss rate [18]. For dynamically allocated structures this placement is typically determined by the memory allocator. Cache index-oblivious allocators may inadvertently place blocks on a restricted fraction of the available cache indexes, artificially and needlessly increasing the conflict miss rate. While some allocators are less vulnerable to this phenomena, no general-purpose malloc allocator is index-aware and methodologically addresses this concern. We demonstrate that many existing state-of-the-art allocators are index-oblivious, admitting performance pathologies for certain block sizes. We show that a simple adjustment within the allocator to control the spacing of blocks can provide better index coverage, which in turn reduces the superfluous conflict miss rate in various applications, improving performance with no observed negative consequences.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hertz:2011:WWR, author = "Matthew Hertz and Stephen Kane and Elizabeth Keudel and Tongxin Bai and Chen Ding and Xiaoming Gu and Jonathan E. Bard", title = "Waste not, want not: resource-based garbage collection in a shared environment", journal = j-SIGPLAN, volume = "46", number = "11", pages = "65--76", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993487", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "To achieve optimal performance, garbage-collected applications must balance the sizes of their heaps dynamically. Sizing the heap too small can reduce throughput by increasing the number of garbage collections that must be performed. Too large a heap, however, can cause the system to page and drag down the overall throughput. In today's multicore, multiprocessor machines, multiple garbage-collected applications may run simultaneously. As a result, each virtual machine (VM) must adjust its memory demands to reflect not only the behavior of the application it is running, but also the behavior of the peer applications running on the system.
We present a memory management system that enables VMs to react to memory demands dynamically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mutlu:2011:MSM, author = "Onur Mutlu", title = "Memory systems in the many-core era: challenges, opportunities, and solution directions", journal = j-SIGPLAN, volume = "46", number = "11", pages = "77--78", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993489", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "The memory subsystem is a fundamental performance and energy bottleneck in almost all computing systems. Recent trends towards increasingly more cores on die, consolidation of diverse workloads on a single chip, and difficulty of DRAM scaling impose new requirements and exacerbate old demands on the memory system. In particular, the need for memory bandwidth and capacity is increasing [14], applications' interference in memory system increasingly limits system performance and makes the system hard to control [12], memory energy and power are key design concerns [8], and DRAM technology consumes significant amount of energy and does not scale down easily to smaller technology nodes [7].", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tene:2011:CCC, author = "Gil Tene and Balaji Iyengar and Michael Wolf", title = "{C4}: the continuously concurrent compacting collector", journal = j-SIGPLAN, volume = "46", number = "11", pages = "79--88", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993491", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "C4, the Continuously Concurrent Compacting Collector, an updated generational form of the Pauseless GC Algorithm [7], is introduced and described, along with details of its implementation on modern X86 hardware. It uses a read barrier to support concurrent compaction, concurrent remapping, and concurrent incremental update tracing. C4 differentiates itself from other generational garbage collectors by supporting simultaneous-generational concurrency: the different generations are collected using concurrent (non stop-the-world) mechanisms that can be simultaneously and independently active.
C4 is able to continuously perform concurrent young generation collections, even during long periods of concurrent full heap collection, allowing C4 to sustain high allocation rates and maintain the efficiency typical to generational collectors, without sacrificing response times or reverting to stop-the-world operation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kalibera:2011:HRO, author = "Tomas Kalibera and Richard Jones", title = "Handles revisited: optimising performance and memory costs in a real-time collector", journal = j-SIGPLAN, volume = "46", number = "11", pages = "89--98", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993492", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "Compacting garbage collectors must update all references to objects they move. Updating is a lengthy operation but the updates must be transparent to the mutator. The consequence is that no space can be reclaimed until all references have been updated which, in a real-time collector, must be done incrementally. One solution is to replace direct references to objects with handles. Handles offer several advantages to a real-time collector. They eliminate the updating problem. They allow immediate reuse of the space used by evacuated objects. They incur no copy reserve overhead. However, the execution time overhead of handles has led to them being abandoned by most modern systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Aigner:2011:STM, author = "Martin Aigner and Andreas Haas and Christoph M. Kirsch and Michael Lippautz and Ana Sokolova and Stephanie Stroka and Andreas Unterweger", title = "Short-term memory for self-collecting mutators", journal = j-SIGPLAN, volume = "46", number = "11", pages = "99--108", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993493", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "We propose a new memory model called short-term memory for managing objects on the heap. In contrast to the traditional persistent memory model for heap management, objects in short-term memory expire after a finite amount of time, which makes deallocation unnecessary. Instead, expiration of objects may be extended, if necessary, by refreshing. We have developed a concurrent, incremental, and non-moving implementation of short-term memory for explicit refreshing called self-collecting mutators that is based on programmer-controlled time and integrated into state-of-the-art runtimes of three programming languages: C, Java, and Go. 
All memory management operations run in constant time without acquiring any locks modulo the underlying allocators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Singer:2011:GCA, author = "Jeremy Singer and George Kovoor and Gavin Brown and Mikel Luj{\'a}n", title = "Garbage collection auto-tuning for {Java} {MapReduce} on multi-cores", journal = j-SIGPLAN, volume = "46", number = "11", pages = "109--118", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993495", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "MapReduce has been widely accepted as a simple programming pattern that can form the basis for efficient, large-scale, distributed data processing. The success of the MapReduce pattern has led to a variety of implementations for different computational scenarios. In this paper we present MRJ, a MapReduce Java framework for multi-core architectures. We evaluate its scalability on a four-core, hyperthreaded Intel Core i7 processor, using a set of standard MapReduce benchmarks. We investigate the significant impact that Java runtime garbage collection has on the performance and scalability of MRJ. We propose the use of memory management auto-tuning techniques based on machine learning.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wagner:2011:CMM, author = "Gregor Wagner and Andreas Gal and Christian Wimmer and Brendan Eich and Michael Franz", title = "Compartmental memory management in a modern {Web} browser", journal = j-SIGPLAN, volume = "46", number = "11", pages = "119--128", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993496", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "Since their inception, the usage pattern of web browsers has changed substantially. Rather than sequentially navigating static web sites, modern web browsers often manage a large number of simultaneous tabs displaying dynamic web content, each of which might be running a substantial amount of client-side JavaScript code. This environment introduced a new degree of parallelism that was not fully embraced by the underlying JavaScript virtual machine architecture. We propose a novel abstraction for multiple disjoint JavaScript heaps, which we call compartments. We use the notion of document origin to cluster objects into separate compartments. 
Objects within a compartment can reference each other directly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tarau:2011:IST, author = "Paul Tarau", title = "Integrated symbol table, engine and heap memory management in multi-engine {Prolog}", journal = j-SIGPLAN, volume = "46", number = "11", pages = "129--138", month = nov, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2076022.1993497", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 15 07:46:57 MST 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '11 conference proceedings.", abstract = "We describe an integrated solution to symbol, heap and logic engine memory management in a context where exchanges of arbitrary Prolog terms occur between multiple dynamically created engines, implemented in a new Java-based experimental Prolog system. As our symbols represent not just Prolog atoms, but also handles to Java objects (including arbitrary size integers and decimals), everything is centered around a symbol garbage collection algorithm ensuring that external objects are shared and exchanged between logic engines efficiently. Taking advantage of a tag-on-data heap representation of Prolog terms, our algorithm performs in-place updates of live symbol references directly on heap cells.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Petricek:2011:EMP, author = "Tomas Petricek and Alan Mycroft and Don Syme", title = "Extending monads with pattern matching", journal = j-SIGPLAN, volume = "46", number = "12", pages = "1--12", month = dec, year = "2011", DOI = "https://doi.org/10.1145/2096148.2034677", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Sequencing of effectful computations can be neatly captured using monads and elegantly written using do notation. In practice such monads often allow additional ways of composing computations, which have to be written explicitly using combinators. We identify joinads, an abstract notion of computation that is stronger than monads and captures many such ad-hoc extensions. In particular, joinads are monads with three additional operations: one of type $m a \to m b \to m (a, b)$ captures various forms of parallel composition, one of type $m a \to m a \to m a$ that is inspired by choice and one of type $m a \to m (m a)$ that captures aliasing of computations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Giorgidze:2011:BBM, author = "George Giorgidze and Torsten Grust and Nils Schweinsberg and Jeroen Weijers", title = "Bringing back monad comprehensions", journal = j-SIGPLAN, volume = "46", number = "12", pages = "13--22", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034678", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper is about a Glasgow Haskell Compiler (GHC) extension that generalises Haskell's list comprehension notation to monads. 
The monad comprehension notation implemented by the extension supports generator and filter clauses, as was the case in the Haskell 1.4 standard. In addition, the extension generalises the recently proposed parallel and SQL-like list comprehension notations to monads. The aforementioned generalisations are formally defined in this paper. The extension will be available in GHC 7.2. This paper gives several instructive examples that we hope will facilitate wide adoption of the extension by the Haskell community. We also argue why the do notation is not always a good fit for monadic libraries and embedded domain-specific languages, especially for those that are based on collection monads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Bolingbroke:2011:TCF, author = "Maximilian Bolingbroke and Simon Peyton Jones and Dimitrios Vytiniotis", title = "Termination combinators forever", journal = j-SIGPLAN, volume = "46", number = "12", pages = "23--34", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034680", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a library-based approach to constructing termination tests suitable for controlling termination of symbolic methods such as partial evaluation, supercompilation and theorem proving. With our combinators, all termination tests are correct by construction. We show how the library can be designed to embody various optimisations of the termination tests, which the user of the library takes advantage of entirely transparently.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Westbrook:2011:HHL, author = "Edwin Westbrook and Nicolas Frisby and Paul Brauner", title = "{Hobbits} for {Haskell}: a library for higher-order encodings in functional programming languages", journal = j-SIGPLAN, volume = "46", number = "12", pages = "35--46", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034681", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Adequate encodings are a powerful programming tool, which eliminate whole classes of program bugs: they ensure that a program cannot generate ill-formed data, because such data is not part of the representation; and they also ensure that a program is well-defined, meaning that it cannot have different behaviors on different representations of the same piece of data. Unfortunately, it has proven difficult to define adequate encodings of programming languages themselves. Such encodings would be very useful in language processing tools such as interpreters, compilers, model-checking tools, etc., as these systems are often difficult to get correct. 
The key problem in representing programming languages is in encoding binding constructs; previous approaches have serious limitations in either the operations they allow or the correctness guarantees they make.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Harper:2011:LWG, author = "Thomas Harper", title = "A library writer's guide to shortcut fusion", journal = j-SIGPLAN, volume = "46", number = "12", pages = "47--58", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034682", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There are now a variety of shortcut fusion techniques in the wild for removing intermediate data structures in Haskell. They are often presented, however, specialised to a specific data structure and interface. This can make it difficult to transfer these techniques to other settings. In this paper, we give a roadmap for a library writer who would like to implement fusion for his own library. We explain shortcut fusion without reference to any specific implementation by treating it as an instance of data refinement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Lippmeier:2011:EPS, author = "Ben Lippmeier and Gabriele Keller", title = "Efficient parallel stencil convolution in {Haskell}", journal = j-SIGPLAN, volume = "46", number = "12", pages = "59--70", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034684", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Stencil convolution is a fundamental building block of many scientific and image processing algorithms. We present a declarative approach to writing such convolutions in Haskell that is both efficient at runtime and implicitly parallel. To achieve this we extend our prior work on the Repa array library with two new features: partitioned and cursored arrays. Combined with careful management of the interaction between GHC and its back-end code generator LLVM, we achieve performance comparable to the standard OpenCV library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Marlow:2011:MDP, author = "Simon Marlow and Ryan Newton and Simon Peyton Jones", title = "A monad for deterministic parallelism", journal = j-SIGPLAN, volume = "46", number = "12", pages = "71--82", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034685", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new programming model for deterministic parallel computation in a pure functional language. 
The model is monadic and has explicit granularity, but allows dynamic construction of dataflow networks that are scheduled at runtime, while remaining deterministic and pure. The implementation is based on monadic concurrency, which has until now only been used to simulate concurrency in functional languages, rather than to provide parallelism. We present the API with its semantics, and argue that parallel execution is deterministic.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Leijen:2011:PCP, author = "Daan Leijen and Manuel Fahndrich and Sebastian Burckhardt", title = "Prettier concurrency: purely functional concurrent revisions", journal = j-SIGPLAN, volume = "46", number = "12", pages = "83--94", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034686", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This article presents an extension to the work of Launchbury and Peyton-Jones on the ST monad. Using a novel model for concurrency, called concurrent revisions [3,5], we show how we can use concurrency together with imperative mutable variables, while still being able to safely convert such computations (in the Rev monad) into pure values again. In contrast to many other transaction models, like software transactional memory (STM), concurrent revisions never use rollback and always deterministically resolve conflicts. As a consequence, concurrent revisions integrate well with side-effecting I/O operations. Using deterministic conflict resolution, concurrent revisions can deal well with situations where there are many conflicts between different threads that modify a shared data structure.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Stefan:2011:FDI, author = "Deian Stefan and Alejandro Russo and John C. Mitchell and David Mazi{\`e}res", title = "Flexible dynamic information flow control in {Haskell}", journal = j-SIGPLAN, volume = "46", number = "12", pages = "95--106", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034688", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a new, dynamic, floating-label approach to language-based information flow control, and present an implementation in Haskell. A labeled IO monad, LIO, keeps track of a current label and permits restricted access to IO functionality, while ensuring that the current label exceeds the labels of all data observed and restricts what can be modified. Unlike other language-based work, LIO also bounds the current label with a current clearance that provides a form of discretionary access control. 
In addition, programs may encapsulate and pass around the results of computations with different labels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Duregaard:2011:EPG, author = "Jonas Dureg{\aa}rd and Patrik Jansson", title = "Embedded parser generators", journal = j-SIGPLAN, volume = "46", number = "12", pages = "107--117", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034689", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a novel method of embedding context-free grammars in Haskell, and to automatically generate parsers and pretty-printers from them. We have implemented this method in a library called BNFC-meta (from the BNF Converter, which it is built on). The library builds compiler front ends using metaprogramming instead of conventional code generation. Parsers are built from labelled BNF grammars that are defined directly in Haskell modules. Our solution combines features of parser generators (static grammar checks, a highly specialised grammar DSL) and adds several features that are otherwise exclusive to combinatory libraries such as the ability to reuse, parameterise and generate grammars inside Haskell.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Epstein:2011:THC, author = "Jeff Epstein and Andrew P. Black and Simon Peyton-Jones", title = "Towards {Haskell} in the cloud", journal = j-SIGPLAN, volume = "46", number = "12", pages = "118--129", month = dec, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2096148.2034690", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jan 17 17:51:46 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Cloud Haskell, a domain-specific language for developing programs for a distributed computing environment. Implemented as a shallow embedding in Haskell, it provides a message-passing communication model, inspired by Erlang, without introducing incompatibility with Haskell's established shared-memory concurrency. A key contribution is a method for serializing function closures for transmission across the network. Cloud Haskell has been implemented; we present example code and some preliminary performance measurements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '11 conference proceedings.", } @Article{Black:2012:PSD, author = "Andrew P. Black and Peter W. 
O'Hearn", title = "Presentation of the {SIGPLAN} distinguished achievement award to {Sir Charles Antony Richard Hoare, FRS, FREng, FBCS}; and interview", journal = j-SIGPLAN, volume = "47", number = "1", pages = "1--2", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103658", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Hoare:2012:MTR, author = "Tony Hoare", title = "Message of thanks: on the receipt of the {2011 ACM SIGPLAN} distinguished achievement award", journal = j-SIGPLAN, volume = "47", number = "1", pages = "3--6", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103659", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{vanStaden:2012:F, author = "Stephan van Staden and Cristiano Calcagno and Bertrand Meyer", title = "Freefinement", journal = j-SIGPLAN, volume = "47", number = "1", pages = "7--18", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103661", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Freefinement is an algorithm that constructs a sound refinement calculus from a verification system under certain conditions. In this paper, a verification system is any formal system for establishing whether an inductively defined term, typically a program, satisfies a specification. Examples of verification systems include Hoare logics and type systems. Freefinement first extends the term language to include specification terms, and builds a verification system for the extended language that is a sound and conservative extension of the original system. The extended system is then transformed into a sound refinement calculus. The resulting refinement calculus can interoperate closely with the verification system --- it is even possible to reuse and translate proofs between them. Freefinement gives a semantics to refinement at an abstract level: it associates each term of the extended language with a set of terms from the original language, and refinement simply reduces this set. The paper applies freefinement to a simple type system for the lambda calculus and also to a Hoare logic.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Joshi:2012:UHI, author = "Saurabh Joshi and Shuvendu K. 
Lahiri and Akash Lal", title = "Underspecified harnesses and interleaved bugs", journal = j-SIGPLAN, volume = "47", number = "1", pages = "19--30", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103662", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Static assertion checking of open programs requires setting up a precise harness to capture the environment assumptions. For instance, a library may require a file handle to be properly initialized before it is passed into it. A harness is used to set up or specify the appropriate preconditions before invoking methods from the program. In the absence of a precise harness, even the most precise automated static checkers are bound to report numerous false alarms. This often limits the adoption of static assertion checking in the hands of a user. In this work, we explore the possibility of automatically filtering away (or prioritizing) warnings that result from imprecision in the harness. We limit our attention to the scenario when one is interested in finding bugs due to concurrency. We define a warning to be an interleaved bug when it manifests on an input for which no sequential interleaving produces a warning. As we argue in the paper, limiting a static analysis to only consider interleaved bugs greatly reduces false positives during static concurrency analysis in the presence of an imprecise harness. We formalize interleaved bugs as a differential analysis between the original program and its sequential version and provide various techniques for finding them. Our implementation CBugs demonstrates that the scheme of finding interleaved bugs can alleviate the need to construct precise harnesses while checking real-life concurrent programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Gardner:2012:TPL, author = "Philippa Anne Gardner and Sergio Maffeis and Gareth David Smith", title = "Towards a program logic for {JavaScript}", journal = j-SIGPLAN, volume = "47", number = "1", pages = "31--44", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103663", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "JavaScript has become the most widely used language for client-side web programming. The dynamic nature of JavaScript makes understanding its code notoriously difficult, leading to buggy programs and a lack of adequate static-analysis tools. We believe that logical reasoning has much to offer JavaScript: a simple description of program behaviour, a clear understanding of module boundaries, and the ability to verify security contracts. We introduce a program logic for reasoning about a broad subset of JavaScript, including challenging features such as prototype inheritance and `with'. We adapt ideas from separation logic to provide tractable reasoning about JavaScript code: reasoning about easy programs is easy; reasoning about hard programs is possible. We prove a strong soundness result. 
All libraries written in our subset and proved correct with respect to their specifications will be well-behaved, even when called by arbitrary JavaScript code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Krishnaswami:2012:HOF, author = "Neelakantan R. Krishnaswami and Nick Benton and Jan Hoffmann", title = "Higher-order functional reactive programming in bounded space", journal = j-SIGPLAN, volume = "47", number = "1", pages = "45--58", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103665", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Functional reactive programming (FRP) is an elegant and successful approach to programming reactive systems declaratively. The high levels of abstraction and expressivity that make FRP attractive as a programming model do, however, often lead to programs whose resource usage is excessive and hard to predict. In this paper, we address the problem of space leaks in discrete-time functional reactive programs. We present a functional reactive programming language that statically bounds the size of the dataflow graph a reactive program creates, while still permitting use of higher-order functions and higher-type streams such as streams of streams. We achieve this with a novel linear type theory that both controls allocation and ensures that all recursive definitions are well-founded. We also give a denotational semantics for our language by combining recent work on metric spaces for the interpretation of higher-order causal functions with length-space models of space-bounded computation. The resulting category is doubly closed and hence forms a model of the logic of bunched implications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Hur:2012:MBK, author = "Chung-Kil Hur and Derek Dreyer and Georg Neis and Viktor Vafeiadis", title = "The marriage of bisimulations and {Kripke} logical relations", journal = j-SIGPLAN, volume = "47", number = "1", pages = "59--72", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103666", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There has been great progress in recent years on developing effective techniques for reasoning about program equivalence in ML-like languages---that is, languages that combine features like higher-order functions, recursive types, abstract types, and general mutable references. Two of the most prominent types of techniques to have emerged are *bisimulations* and *Kripke logical relations (KLRs)*. While both approaches are powerful, their complementary advantages have led us and other researchers to wonder whether there is an essential tradeoff between them. Furthermore, both approaches seem to suffer from fundamental limitations if one is interested in scaling them to inter-language reasoning. 
In this paper, we propose *relation transition systems (RTSs)*, which marry together some of the most appealing aspects of KLRs and bisimulations. In particular, RTSs show how bisimulations' support for reasoning about recursive features via *coinduction* can be synthesized with KLRs' support for reasoning about local state via *state transition systems*. Moreover, we have designed RTSs to avoid the limitations of KLRs and bisimulations that preclude their generalization to inter-language reasoning. Notably, unlike KLRs, RTSs are transitively composable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{James:2012:IE, author = "Roshan P. James and Amr Sabry", title = "Information effects", journal = j-SIGPLAN, volume = "47", number = "1", pages = "73--84", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103667", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computation is a physical process which, like all other physical processes, is fundamentally reversible. From the notion of type isomorphisms, we derive a typed, universal, and reversible computational model in which information is treated as a linear resource that can neither be duplicated nor erased. We use this model as a semantic foundation for computation and show that the `gap' between conventional irreversible computation and logically reversible computation can be captured by a type-and-effect system. Our type-and-effect system is structured as an arrow metalanguage that exposes creation and erasure of information as explicit effect operations. Irreversible computations arise from interactions with an implicit information environment, thus making them a derived notion, much like open systems in Physics. We sketch several applications which can benefit from an explicit treatment of information effects, such as quantitative information-flow security and differential privacy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Yang:2012:LAE, author = "Jean Yang and Kuat Yessenov and Armando Solar-Lezama", title = "A language for automatically enforcing privacy policies", journal = j-SIGPLAN, volume = "47", number = "1", pages = "85--96", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103669", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is becoming increasingly important for applications to protect sensitive data. With current techniques, the programmer bears the burden of ensuring that the application's behavior adheres to policies about where sensitive values may flow. Unfortunately, privacy policies are difficult to manage because their global nature requires coordinated reasoning and enforcement. To address this problem, we describe a programming model that makes the system responsible for ensuring adherence to privacy policies. 
The programming model has two components: (1) core programs describing functionality independent of privacy concerns and (2) declarative, decentralized policies controlling how sensitive values are disclosed. Each sensitive value encapsulates multiple views; policies describe which views are allowed based on the output context. The system is responsible for automatically ensuring that outputs are consistent with the policies. We have implemented this programming model in a new functional constraint language named Jeeves. In Jeeves, sensitive values are introduced as symbolic variables and policies correspond to constraints that are resolved at output channels. We have implemented Jeeves as a Scala library using an SMT solver as a model finder. In this paper we describe the dynamic and static semantics of Jeeves and the properties about policy enforcement that the semantics guarantees. We also describe our experience implementing a conference management system and a social network.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Barthe:2012:PRR, author = "Gilles Barthe and Boris K{\"o}pf and Federico Olmedo and Santiago Zanella B{\'e}guelin", title = "Probabilistic relational reasoning for differential privacy", journal = j-SIGPLAN, volume = "47", number = "1", pages = "97--110", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103670", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Differential privacy is a notion of confidentiality that protects the privacy of individuals while allowing useful computations on their private data. Deriving differential privacy guarantees for real programs is a difficult and error-prone task that calls for principled approaches and tool support. Approaches based on linear types and static analysis have recently emerged; however, an increasing number of programs achieve privacy using techniques that cannot be analyzed by these approaches. Examples include programs that aim for weaker, approximate differential privacy guarantees, programs that use the Exponential mechanism, and randomized programs that achieve differential privacy without using any standard mechanism. Providing support for reasoning about the privacy of such programs has been an open problem. We report on CertiPriv, a machine-checked framework for reasoning about differential privacy built on top of the Coq proof assistant. The central component of CertiPriv is a quantitative extension of a probabilistic relational Hoare logic that enables one to derive differential privacy guarantees for programs from first principles. We demonstrate the expressiveness of CertiPriv using a number of examples whose formal analysis is out of the reach of previous techniques. 
In particular, we provide the first machine-checked proofs of correctness of the Laplacian and Exponential mechanisms and of the privacy of randomized and streaming algorithms from the recent literature.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Heidegger:2012:APC, author = "Phillip Heidegger and Annette Bieniusa and Peter Thiemann", title = "Access permission contracts for scripting languages", journal = j-SIGPLAN, volume = "47", number = "1", pages = "111--122", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103671", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The ideal software contract fully specifies the behavior of an operation. Often, in particular in the context of scripting languages, a full specification may be cumbersome to state and may not even be desired. In such cases, a partial specification, which describes selected aspects of the behavior, may be used to raise the confidence in an implementation of the operation to a reasonable level. We propose a novel kind of contract for object-based languages that specifies the side effects of an operation with access permissions. An access permission contract uses sets of access paths to express read and write permissions for the properties of the objects accessible from the operation. We specify a monitoring semantics for access permission contracts and implement this semantics in a contract system for JavaScript. We prove soundness and stability of violation under increasing aliasing for our semantics. Applications of access permission contracts include enforcing modularity, test-driven development, program understanding, and regression testing. With respect to testing and understanding, we find that adding access permissions to contracts increases the effectiveness of error detection through contract monitoring by 6-13\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Madhusudan:2012:RPI, author = "Parthasarathy Madhusudan and Xiaokang Qiu and Andrei Stefanescu", title = "Recursive proofs for inductive tree data-structures", journal = j-SIGPLAN, volume = "47", number = "1", pages = "123--136", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103673", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We develop logical mechanisms and procedures to facilitate the verification of full functional properties of inductive tree data-structures using recursion that are sound, incomplete, but terminating. Our contribution rests in a new extension of first-order logic with recursive definitions called Dryad, a syntactical restriction on pre- and post-conditions of recursive imperative programs using Dryad, and a systematic methodology for accurately unfolding the footprint on the heap uncovered by the program that leads to finding simple recursive proofs using formula abstraction and calls to SMT solvers. 
We evaluate our methodology empirically and show that several complex tree data-structure algorithms can be checked against full functional specifications automatically, given pre- and post-conditions. This results in the first automatic terminating methodology for proving a wide variety of annotated algorithms on tree data-structures correct, including max-heaps, treaps, red-black trees, AVL trees, binomial heaps, and B-trees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Veanes:2012:SFS, author = "Margus Veanes and Pieter Hooimeijer and Benjamin Livshits and David Molnar and Nikolaj Bjorner", title = "Symbolic finite state transducers: algorithms and applications", journal = j-SIGPLAN, volume = "47", number = "1", pages = "137--150", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103674", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Finite automata and finite transducers are used in a wide range of applications in software engineering, from regular expressions to specification languages. We extend these classic objects with symbolic alphabets represented as parametric theories. Admitting potentially infinite alphabets makes this representation strictly more general and succinct than classical finite transducers and automata over strings. Despite this, the main operations, including composition, checking that a transducer is single-valued, and equivalence checking for single-valued symbolic finite transducers are effective given a decision procedure for the background theory. We provide novel algorithms for these operations and extend composition to symbolic transducers augmented with registers. Our base algorithms are unusual in that they are nonconstructive, therefore, we also supply a separate model generation algorithm that can quickly find counterexamples in the case two symbolic finite transducers are not equivalent. The algorithms give rise to a complete decidable algebra of symbolic transducers. Unlike previous work, we do not need any syntactic restriction of the formulas on the transitions, only a decision procedure. In practice we leverage recent advances in satisfiability modulo theory (SMT) solvers. We demonstrate our techniques on four case studies, covering a wide range of applications. 
Our techniques can synthesize string pre-images in excess of 8,000 bytes in roughly a minute, and we find that our new encodings significantly outperform previous techniques in succinctness and speed of analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Koksal:2012:CC, author = "Ali Sinan K{\"o}ksal and Viktor Kuncak and Philippe Suter", title = "Constraints as control", journal = j-SIGPLAN, volume = "47", number = "1", pages = "151--164", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103675", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an extension of Scala that supports constraint programming over bounded and unbounded domains. The resulting language, Kaplan, provides the benefits of constraint programming while preserving the existing features of Scala. Kaplan integrates constraint and imperative programming by using constraints as an advanced control structure; the developers use the monadic 'for' construct to iterate over the solutions of constraints or branch on the existence of a solution. The constructs we introduce have simple semantics that can be understood as explicit enumeration of values, but are implemented more efficiently using symbolic reasoning. Kaplan programs can manipulate constraints at run-time, with the combined benefits of type-safe syntax trees and first-class functions. The language of constraints is a functional subset of Scala, supporting arbitrary recursive function definitions over algebraic data types, sets, maps, and integers. Our implementation runs on a platform combining a constraint solver with a standard virtual machine. For constraint solving we use an algorithm that handles recursive function definitions through fair function unrolling and builds upon the state-of-the art SMT solver Z3. We evaluate Kaplan on examples ranging from enumeration of data structures to execution of declarative specifications. We found Kaplan promising because it is expressive, supporting a range of problem domains, while enabling full-speed execution of programs that do not rely on constraint programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Austin:2012:MFD, author = "Thomas H. Austin and Cormac Flanagan", title = "Multiple facets for dynamic information flow", journal = j-SIGPLAN, volume = "47", number = "1", pages = "165--178", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103677", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "JavaScript has become a central technology of the web, but it is also the source of many security problems, including cross-site scripting attacks and malicious advertising code. Central to these problems is the fact that code from untrusted sources runs with full privileges. We implement information flow controls in Firefox to help prevent violations of data confidentiality and integrity. 
Most previous information flow techniques have primarily relied on either static type systems, which are a poor fit for JavaScript, or on dynamic analyses that sometimes get stuck due to problematic implicit flows, even in situations where the target web application correctly satisfies the desired security policy. We introduce faceted values, a new mechanism for providing information flow security in a dynamic manner that overcomes these limitations. Taking inspiration from secure multi-execution, we use faceted values to simultaneously and efficiently simulate multiple executions for different security levels, thus providing non-interference with minimal overhead, and without the reliance on the stuck executions of prior dynamic approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Ray:2012:DCI, author = "Donald Ray and Jay Ligatti", title = "Defining code-injection attacks", journal = j-SIGPLAN, volume = "47", number = "1", pages = "179--190", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103678", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper shows that existing definitions of code-injection attacks (e.g., SQL-injection attacks) are flawed. The flaws make it possible for attackers to circumvent existing mechanisms, by supplying code-injecting inputs that are not recognized as such. The flaws also make it possible for benign inputs to be treated as attacks. After describing these flaws in conventional definitions of code-injection attacks, this paper proposes a new definition, which is based on whether the symbols input to an application get used as (normal-form) values in the application's output. Because values are already fully evaluated, they cannot be considered `code' when injected. This simple new definition of code-injection attacks avoids the problems of existing definitions, improves our understanding of how and when such attacks occur, and enables us to evaluate the effectiveness of mechanisms for mitigating such attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Basu:2012:DCR, author = "Samik Basu and Tevfik Bultan and Meriem Ouederni", title = "Deciding choreography realizability", journal = j-SIGPLAN, volume = "47", number = "1", pages = "191--202", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103680", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Since software systems are becoming increasingly more concurrent and distributed, modeling and analysis of interactions among their components is a crucial problem. In several application domains, message-based communication is used as the interaction mechanism, and the communication contract among the components of the system is specified semantically as a state machine. In the service-oriented computing domain such communication contracts are called `choreography' specifications. 
A choreography specification identifies allowable ordering of message exchanges in a distributed system. A fundamental question about a choreography specification is determining its realizability, i.e., given a choreography specification, is it possible to build a distributed system that communicates exactly as the choreography specifies? Checking realizability of choreography specifications has been an open problem for several years and it was not known if this was a decidable problem. In this paper we give necessary and sufficient conditions for realizability of choreographies. We implemented the proposed realizability check and our experiments show that it can efficiently determine the realizability of (1) web service choreographies, (2) Singularity OS channel contracts, and (3) UML collaboration (communication) diagrams.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Bouajjani:2012:ARP, author = "Ahmed Bouajjani and Michael Emmi", title = "Analysis of recursively parallel programs", journal = j-SIGPLAN, volume = "47", number = "1", pages = "203--214", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103681", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a general formal model of isolated hierarchical parallel computations, and identify several fragments to match the concurrency constructs present in real-world programming languages such as Cilk and X10. By associating fundamental formal models (vector addition systems with recursive transitions) to each fragment, we provide a common platform for exposing the relative difficulties of algorithmic reasoning. For each case we measure the complexity of deciding state-reachability for finite-data recursive programs, and propose algorithms for the decidable cases. The complexities which include PTIME, NP, EXPSPACE, and 2EXPTIME contrast with undecidable state-reachability for recursive multi-threaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Rexford:2012:PLP, author = "Jennifer Rexford", title = "Programming languages for programmable networks", journal = j-SIGPLAN, volume = "47", number = "1", pages = "215--216", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103683", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today's computer networks perform a bewildering array of tasks, from routing and access control, to traffic monitoring and load balancing. To support wireless users accessing services hosted in the cloud, enterprise and data-center networks are under increasing pressure to support client mobility, virtual-machine migration, resource isolation between cloud services, and energy-efficient operation. 
Yet, network administrators must configure the network through closed and proprietary interfaces to heterogeneous devices, such as routers, switches, firewalls, load balancers, network address translators, and intrusion detection systems. Not surprisingly, configuring these complex networks is expensive and error-prone, and innovation in network management proceeds at a snail's pace. During the past several years, the networking industry and research community have pushed for greater openness in networking software, and a clearer separation between networking devices and the software that controls them. This broad trend is known as Software Defined Networking (SDN). A hallmark of SDN is having an open interface for controller software running on a commodity computer to install packet-processing rules in the underlying switches. In particular, the OpenFlow protocol (see www.openflow.org) has significant momentum. Many commercial switches support OpenFlow, and a number of campus, data-center, and backbone networks have deployed the new technology. With the emergence of open interfaces to network devices, the time is ripe to rethink the design of network software, to put networking on a stronger foundation and foster innovation in networked services. The programming languages community can play a vital role in this transformation, by creating languages, compilers, run-time systems, and testing and verification techniques that raise the level of abstraction for programming the network. In this talk, we give an overview of Software Defined Networking, and survey the early programming-languages research in this area. We also outline exciting opportunities for interdisciplinary research at the intersection of programming languages and computer networks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Monsanto:2012:CRT, author = "Christopher Monsanto and Nate Foster and Rob Harrison and David Walker", title = "A compiler and run-time system for network programming languages", journal = j-SIGPLAN, volume = "47", number = "1", pages = "217--230", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103685", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software-defined networks (SDNs) are a new kind of network architecture in which a controller machine manages a distributed collection of switches by instructing them to install or uninstall packet-forwarding rules and report traffic statistics. The recently formed Open Networking Consortium, whose members include Google, Facebook, Microsoft, Verizon, and others, hopes to use this architecture to transform the way that enterprise and data center networks are implemented. In this paper, we define a high-level, declarative language, called NetCore, for expressing packet-forwarding policies on SDNs. NetCore is expressive, compositional, and has a formal semantics. To ensure that a majority of packets are processed efficiently on switches---instead of on the controller---we present new compilation algorithms for NetCore and couple them with a new run-time system that issues rule installation commands and traffic-statistics queries to switches. 
Together, the compiler and run-time system generate efficient rules whenever possible and outperform the simple, manual techniques commonly used to program SDNs today. In addition, the algorithms we develop are generic, assuming only that the packet-matching capabilities available on switches satisfy some basic algebraic laws. Overall, this paper delivers a new design for a high-level network programming language; an improved set of compiler algorithms; a new run-time system for SDN architectures; the first formal semantics and proofs of correctness in this domain; and an implementation and evaluation that demonstrates the performance benefits over traditional manual techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Chugh:2012:NRL, author = "Ravi Chugh and Patrick M. Rondon and Ranjit Jhala", title = "Nested refinements: a logic for duck typing", journal = j-SIGPLAN, volume = "47", number = "1", pages = "231--244", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103686", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programs written in dynamic languages make heavy use of features --- run-time type tests, value-indexed dictionaries, polymorphism, and higher-order functions --- that are beyond the reach of type systems that employ either purely syntactic or purely semantic reasoning. We present a core calculus, System D, that merges these two modes of reasoning into a single powerful mechanism of nested refinement types wherein the typing relation is itself a predicate in the refinement logic. System D coordinates SMT-based logical implication and syntactic subtyping to automatically typecheck sophisticated dynamic language programs. By coupling nested refinements with McCarthy's theory of finite maps, System D can precisely reason about the interaction of higher-order functions, polymorphism, and dictionaries. The addition of type predicates to the refinement logic creates a circularity that leads to unique technical challenges in the metatheory, which we solve with a novel stratification approach that we use to prove the soundness of System D.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Cousot:2012:AIFa, author = "Patrick Cousot and Radhia Cousot", title = "An abstract interpretation framework for termination", journal = j-SIGPLAN, volume = "47", number = "1", pages = "245--258", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103687", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Proof, verification and analysis methods for termination all rely on two induction principles: (1) a variant function or induction on data ensuring progress towards the end and (2) some form of induction on the program structure. The abstract interpretation design principle is first illustrated for the design of new forward and backward proof, verification and analysis methods for safety. 
The safety collecting semantics defining the strongest safety property of programs is first expressed in a constructive fixpoint form. Safety proof and checking/verification methods then immediately follow by fixpoint induction. Static analysis of abstract safety properties such as invariance are constructively designed by fixpoint abstraction (or approximation) to (automatically) infer safety properties. So far, no such clear design principle did exist for termination so that the existing approaches are scattered and largely not comparable with each other. For (1), we show that this design principle applies equally well to potential and definite termination. The trace-based termination collecting semantics is given a fixpoint definition. Its abstraction yields a fixpoint definition of the best variant function. By further abstraction of this best variant function, we derive the Floyd/Turing termination proof method as well as new static analysis methods to effectively compute approximations of this best variant function. For (2), we introduce a generalization of the syntactic notion of structural induction (as found in Hoare logic) into a semantic structural induction based on the new semantic concept of inductive trace cover covering execution traces by segments, a new basis for formulating program properties. Its abstractions allow for generalized recursive proof, verification and static analysis methods by induction on both program structure, control, and data. Examples of particular instances include Floyd's handling of loop cutpoints as well as nested loops, Burstall's intermittent assertion total correctness proof method, and Podelski-Rybalchenko transition invariants.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Hoder:2012:PGA, author = "Krystof Hoder and Laura Kovacs and Andrei Voronkov", title = "Playing in the grey area of proofs", journal = j-SIGPLAN, volume = "47", number = "1", pages = "259--272", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103689", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Interpolation is an important technique in verification and static analysis of programs. In particular, interpolants extracted from proofs of various properties are used in invariant generation and bounded model checking. A number of recent papers studies interpolation in various theories and also extraction of smaller interpolants from proofs. In particular, there are several algorithms for extracting of interpolants from so-called local proofs. The main contribution of this paper is a technique of minimising interpolants based on transformations of what we call the `grey area' of local proofs. Another contribution is a technique of transforming, under certain common conditions, arbitrary proofs into local ones. Unlike many other interpolation techniques, our technique is very general and applies to arbitrary theories. Our approach is implemented in the theorem prover Vampire and evaluated on a large number of benchmarks coming from first-order theorem proving and bounded model checking using logic with equality, uninterpreted functions and linear integer arithmetic. 
Our experiments demonstrate the power of the new techniques: for example, it is not unusual that our proof transformation gives more than a tenfold reduction in the size of interpolants.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Stampoulis:2012:SUE, author = "Antonis Stampoulis and Zhong Shao", title = "Static and user-extensible proof checking", journal = j-SIGPLAN, volume = "47", number = "1", pages = "273--284", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103690", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite recent successes, large-scale proof development within proof assistants remains an arcane art that is extremely time-consuming. We argue that this can be attributed to two profound shortcomings in the architecture of modern proof assistants. The first is that proofs need to include a large amount of minute detail; this is due to the rigidity of the proof checking process, which cannot be extended with domain-specific knowledge. In order to avoid these details, we rely on developing and using tactics, specialized procedures that produce proofs. Unfortunately, tactics are both hard to write and hard to use, revealing the second shortcoming of modern proof assistants. This is because there is no static knowledge about their expected use and behavior. As has recently been demonstrated, languages that allow type-safe manipulation of proofs, like Beluga, Delphin and VeriML, can be used to partly mitigate this second issue, by assigning rich types to tactics. Still, the architectural issues remain. In this paper, we build on this existing work, and demonstrate two novel ideas: an extensible conversion rule and support for static proof scripts. Together, these ideas enable us to support both user-extensible proof checking, and sophisticated static checking of tactics, leading to a new point in the design space of future proof assistants. Both ideas are based on the interplay between a light-weight staging construct and the rich type information available.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Klein:2012:RYR, author = "Casey Klein and John Clements and Christos Dimoulas and Carl Eastlund and Matthias Felleisen and Matthew Flatt and Jay A. McCarthy and Jon Rafkind and Sam Tobin-Hochstadt and Robert Bruce Findler", title = "Run your research: on the effectiveness of lightweight mechanization", journal = j-SIGPLAN, volume = "47", number = "1", pages = "285--296", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103691", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Formal models serve in many roles in the programming language community. In its primary role, a model communicates the idea of a language design; the architecture of a language tool; or the essence of a program analysis. No matter which role it plays, however, a faulty model doesn't serve its purpose. 
One way to eliminate flaws from a model is to write it down in a mechanized formal language. It is then possible to state theorems about the model, to prove them, and to check the proofs. Over the past nine years, PLT has developed and explored a lightweight version of this approach, dubbed Redex. In a nutshell, Redex is a domain-specific language for semantic models that is embedded in the Racket programming language. The effort of creating a model in Redex is often no more burdensome than typesetting it with LaTeX; the difference is that Redex comes with tools for the semantics engineering life cycle.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Farzan:2012:VPC, author = "Azadeh Farzan and Zachary Kincaid", title = "Verification of parameterized concurrent programs by modular reasoning about data and control", journal = j-SIGPLAN, volume = "47", number = "1", pages = "297--308", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103693", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we consider the problem of verifying thread-state properties of multithreaded programs in which the number of active threads cannot be statically bounded. Our approach is based on decomposing the task into two modules, where one reasons about data and the other reasons about control. The data module computes thread-state invariants (e.g., linear constraints over global variables and local variables of one thread) using the thread interference information computed by the control module. The control module computes a representation of thread interference, as an incrementally constructed data flow graph, using the data invariants provided by the data module. These invariants are used to rule out patterns of thread interference that can not occur in a real program execution. The two modules are incorporated into a feedback loop, so that the abstractions of data and interference are iteratively coarsened as the algorithm progresses (that is, they become weaker) until a fixed point is reached. Our approach is sound and terminating, and applicable to programs with infinite state (e.g., unbounded integers) and unboundedly many threads. The verification method presented in this paper has been implemented into a tool, called Duet. 
We demonstrate the effectiveness of our technique by verifying properties of a selection of Linux device drivers using Duet, and also compare Duet with previous work on verification of parameterized Boolean program using the Boolean abstractions of these drivers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Botincan:2012:RSS, author = "Matko Botincan and Mike Dodds and Suresh Jagannathan", title = "Resource-sensitive synchronization inference by abduction", journal = j-SIGPLAN, volume = "47", number = "1", pages = "309--322", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103694", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an analysis which takes as its input a sequential program, augmented with annotations indicating potential parallelization opportunities, and a sequential proof, written in separation logic, and produces a correctly-synchronized parallelized program and proof of that program. Unlike previous work, ours is not an independence analysis; we insert synchronization constructs to preserve relevant dependencies found in the sequential program that may otherwise be violated by a naive translation. Separation logic allows us to parallelize fine-grained patterns of resource-usage, moving beyond straightforward points-to analysis. Our analysis works by using the sequential proof to discover dependencies between different parts of the program. It leverages these discovered dependencies to guide the insertion of synchronization primitives into the parallelized program, and to ensure that the resulting parallelized program satisfies the same specification as the original sequential program, and exhibits the same sequential behaviour. Our analysis is built using frame inference and abduction, two techniques supported by an increasing number of separation logic tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Reddy:2012:SCI, author = "Uday S. Reddy and John C. Reynolds", title = "Syntactic control of interference for separation logic", journal = j-SIGPLAN, volume = "47", number = "1", pages = "323--336", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103695", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Separation Logic has witnessed tremendous success in recent years in reasoning about programs that deal with heap storage. Its success owes to the fundamental principle that one should keep separate areas of the heap storage separate in program reasoning. However, the way Separation Logic deals with program variables continues to be based on traditional Hoare Logic without taking any benefit of the separation principle. This has led to unwieldy proof rules suffering from lack of clarity as well as questions surrounding their soundness. 
In this paper, we extend the separation idea to the treatment of variables in Separation Logic, especially Concurrent Separation Logic, using the system of Syntactic Control of Interference proposed by Reynolds in 1978. We extend the original system with permission algebras, making it more powerful and able to deal with the issues of concurrent programs. The result is a streamlined presentation of Concurrent Separation Logic, whose rules are memorable and soundness obvious. We also include a discussion of how the new rules impact the semantics and devise static analysis techniques to infer the required permissions automatically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Licata:2012:CDT, author = "Daniel R. Licata and Robert Harper", title = "Canonicity for $2$-dimensional type theory", journal = j-SIGPLAN, volume = "47", number = "1", pages = "337--348", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103697", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Higher-dimensional dependent type theory enriches conventional one-dimensional dependent type theory with additional structure expressing equivalence of elements of a type. This structure may be employed in a variety of ways to capture rather coarse identifications of elements, such as a universe of sets considered modulo isomorphism. Equivalence must be respected by all families of types and terms, as witnessed computationally by a type-generic program. Higher-dimensional type theory has applications to code reuse for dependently typed programming, and to the formalization of mathematics. In this paper, we develop a novel judgemental formulation of a two-dimensional type theory, which enjoys a canonicity property: a closed term of boolean type is definitionally equal to true or false. Canonicity is a necessary condition for a computational interpretation of type theory as a programming language, and does not hold for existing axiomatic presentations of higher-dimensional type theory. The method of proof is a generalization of the NuPRL semantics, interpreting types as syntactic groupoids rather than equivalence relations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Kammar:2012:AFE, author = "Ohad Kammar and Gordon D. Plotkin", title = "Algebraic foundations for effect-dependent optimisations", journal = j-SIGPLAN, volume = "47", number = "1", pages = "349--360", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103698", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a general theory of Gifford-style type and effect annotations, where effect annotations are sets of effects. Generality is achieved by recourse to the theory of algebraic effects, a development of Moggi's monadic theory of computational effects that emphasises the operations causing the effects at hand and their equational theory.
The key observation is that annotation effects can be identified with operation symbols. We develop an annotated version of Levy's Call-by-Push-Value language with a kind of computations for every effect set; it can be thought of as a sequential, annotated intermediate language. We develop a range of validated optimisations (i.e., equivalences), generalising many existing ones and adding new ones. We classify these optimisations as structural, algebraic, or abstract: structural optimisations always hold; algebraic ones depend on the effect theory at hand; and abstract ones depend on the global nature of that theory (we give modularly-checkable sufficient conditions for their validity).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Cretin:2012:PCA, author = "Julien Cretin and Didier R{\'e}my", title = "On the power of coercion abstraction", journal = j-SIGPLAN, volume = "47", number = "1", pages = "361--372", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103699", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Erasable coercions in System F-eta, also known as retyping functions, are well-typed eta-expansions of the identity. They may change the type of terms without changing their behavior and can thus be erased before reduction. Coercions in F-eta can model subtyping of known types and some displacement of quantifiers, but not subtyping assumptions nor certain forms of delayed type instantiation. We generalize F-eta by allowing abstraction over retyping functions. We follow a general approach where computing with coercions can be seen as computing in the lambda-calculus but keeping track of which parts of terms are coercions. We obtain a language where coercions do not contribute to the reduction but may block it and are thus not erasable. We recover erasable coercions by choosing a weak reduction strategy and restricting coercion abstraction to value-forms or by restricting abstraction to coercions that are polymorphic in their domain or codomain. The latter variant subsumes F-eta, F-sub, and MLF in a unified framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Naik:2012:AT, author = "Mayur Naik and Hongseok Yang and Ghila Castelnuovo and Mooly Sagiv", title = "Abstractions from tests", journal = j-SIGPLAN, volume = "47", number = "1", pages = "373--386", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103701", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a framework for leveraging dynamic analysis to find good abstractions for static analysis. A static analysis in our framework is parametrised. Our main insight is to directly and efficiently compute from a concrete trace, a necessary condition on the parameter configurations to prove a given query, and thereby prune the space of parameter configurations that the static analysis must consider. 
We provide constructive algorithms for two instance analyses in our framework: a flow- and context-sensitive thread-escape analysis and a flow- and context-insensitive points-to analysis. We show the efficacy of these analyses, and our approach, on six Java programs comprising two million bytecodes: the thread-escape analysis resolves 80\% of queries on average, disproving 28\% and proving 52\%; the points-to analysis resolves 99\% of queries on average, disproving 29\% and proving 70\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Smaragdakis:2012:SPR, author = "Yannis Smaragdakis and Jacob Evans and Caitlin Sadowski and Jaeheon Yi and Cormac Flanagan", title = "Sound predictive race detection in polynomial time", journal = j-SIGPLAN, volume = "47", number = "1", pages = "387--400", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103702", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data races are among the most reliable indicators of programming errors in concurrent software. For at least two decades, Lamport's happens-before (HB) relation has served as the standard test for detecting races--other techniques, such as lockset-based approaches, fail to be sound, as they may falsely warn of races. This work introduces a new relation, causally-precedes (CP), which generalizes happens-before to observe more races without sacrificing soundness. Intuitively, CP tries to capture the concept of happens-before ordered events that must occur in the observed order for the program to observe the same values. What distinguishes CP from past predictive race detection approaches (which also generalize an observed execution to detect races in other plausible executions) is that CP-based race detection is both sound and of polynomial complexity. We demonstrate that the unique aspects of CP result in practical benefit. Applying CP to real-world programs, we successfully analyze server-level applications (e.g., Apache FtpServer) and show that traces longer than in past predictive race analyses can be analyzed in mere seconds to a few minutes. For these programs, CP race detection uncovers races that are hard to detect by repeated execution and HB race detection: a single run of CP race detection produces several races not discovered by 10 separate rounds of happens-before race detection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Bojanczyk:2012:TNC, author = "Mikolaj Bojanczyk and Laurent Braud and Bartek Klin and Slawomir Lasota", title = "Towards nominal computation", journal = j-SIGPLAN, volume = "47", number = "1", pages = "401--412", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103704", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nominal sets are a different kind of set theory, with a more relaxed notion of finiteness. 
They offer an elegant formalism for describing lambda-terms modulo alpha-conversion, or automata on data words. This paper is an attempt at defining computation in nominal sets. We present a rudimentary programming language, called Nlambda. The key idea is that it includes a native type for finite sets in the nominal sense. To illustrate the power of our language, we write short programs that process automata on data words.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Cave:2012:PBI, author = "Andrew Cave and Brigitte Pientka", title = "Programming with binders and indexed data-types", journal = j-SIGPLAN, volume = "47", number = "1", pages = "413--424", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103705", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We show how to combine a general purpose type system for an existing language with support for programming with binders and contexts by refining the type system of ML with a restricted form of dependent types where index objects are drawn from contextual LF. This allows the user to specify formal systems within the logical framework LF and index ML types with contextual LF objects. Our language design keeps the index language generic, only requiring decidability of equality of the index language, providing a modular design. To illustrate the elegance and effectiveness of our language, we give programs for closure conversion and normalization by evaluation. Our three key technical contributions are: (1) We give a bi-directional type system for our core language which is centered around refinement substitutions instead of constraint solving. As a consequence, type checking is decidable and easy to trust, although constraint solving may be undecidable. (2) We give a big-step environment based operational semantics with environments which lends itself to efficient implementation. (3) We prove our language to be type safe and have mechanized our theoretical development in the proof assistant Coq using the fresh approach to binding.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Moore:2012:MLF, author = "J. Strother Moore", title = "Meta-level features in an industrial-strength theorem prover", journal = j-SIGPLAN, volume = "47", number = "1", pages = "425--426", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103707", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The ACL2 theorem prover---the current incarnation of `the' Boyer--Moore theorem prover---is a theorem prover for an extension of a first-order, applicative subset of Common Lisp. The ACL2 system provides a useful specification and modeling language as well as a useful mechanical theorem proving environment. ACL2 is in use at several major microprocessor manufacturers to verify functional correctness of important components of commercial designs.
This talk explores the design of ACL2 and the tradeoffs that have turned out to be pivotal to its success.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Zhao:2012:FLI, author = "Jianzhou Zhao and Santosh Nagarakatte and Milo M. K. Martin and Steve Zdancewic", title = "Formalizing the {LLVM} intermediate representation for verified program transformations", journal = j-SIGPLAN, volume = "47", number = "1", pages = "427--440", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103709", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents Vellvm (verified LLVM), a framework for reasoning about programs expressed in LLVM's intermediate representation and transformations that operate on it. Vellvm provides a mechanized formal semantics of LLVM's intermediate representation, its type system, and properties of its SSA form. The framework is built using the Coq interactive theorem prover. It includes multiple operational semantics and proves relations among them to facilitate different reasoning styles and proof techniques. To validate Vellvm's design, we extract an interpreter from the Coq formal semantics that can execute programs from the LLVM test suite and thus be compared against LLVM reference implementations. To demonstrate Vellvm's practicality, we formalize and verify a previously proposed transformation that hardens C programs against spatial memory safety violations. Vellvm's tools allow us to extract a new, verified implementation of the transformation pass that plugs into the real LLVM infrastructure; its performance is competitive with the non-verified, ad-hoc original.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Zhu:2012:RAA, author = "Zeyuan Allen Zhu and Sasa Misailovic and Jonathan A. Kelner and Martin Rinard", title = "Randomized accuracy-aware program transformations for efficient approximate computations", journal = j-SIGPLAN, volume = "47", number = "1", pages = "441--454", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103710", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite the fact that approximate computations have come to dominate many areas of computer science, the field of program transformations has focused almost exclusively on traditional semantics-preserving transformations that do not attempt to exploit the opportunity, available in many computations, to acceptably trade off accuracy for benefits such as increased performance and reduced resource consumption. We present a model of computation for approximate computations and an algorithm for optimizing these computations.
The algorithm works with two classes of transformations: substitution transformations (which select one of a number of available implementations for a given function, with each implementation offering a different combination of accuracy and resource consumption) and sampling transformations (which randomly discard some of the inputs to a given reduction). The algorithm produces a $ (1 + \epsilon) $ randomized approximation to the optimal randomized computation (which minimizes resource consumption subject to a probabilistic accuracy specification in the form of a maximum expected error or maximum error variance).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Liang:2012:RGB, author = "Hongjin Liang and Xinyu Feng and Ming Fu", title = "A rely-guarantee-based simulation for verifying concurrent program transformations", journal = j-SIGPLAN, volume = "47", number = "1", pages = "455--468", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103711", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Verifying program transformations usually requires proving that the resulting program (the target) refines or is equivalent to the original one (the source). However, the refinement relation between individual sequential threads cannot be preserved in general with the presence of parallel compositions, due to instruction reordering and the different granularities of atomic operations at the source and the target. On the other hand, the refinement relation defined based on fully abstract semantics of concurrent programs assumes arbitrary parallel environments, which is too strong and cannot be satisfied by many well-known transformations. In this paper, we propose a Rely-Guarantee-based Simulation (RGSim) to verify concurrent program transformations. The relation is parametrized with constraints of the environments that the source and the target programs may compose with. It considers the interference between threads and their environments, thus is less permissive than relations over sequential programs. It is compositional w.r.t. parallel compositions as long as the constraints are satisfied. Also, RGSim does not require semantics preservation under all environments, and can incorporate the assumptions about environments made by specific program transformations in the form of rely/guarantee conditions. We use RGSim to reason about optimizations and prove atomicity of concurrent objects. We also propose a general garbage collector verification framework based on RGSim, and verify the Boehm et al. 
concurrent mark-sweep GC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Balabonski:2012:UAF, author = "Thibaut Balabonski", title = "A unified approach to fully lazy sharing", journal = j-SIGPLAN, volume = "47", number = "1", pages = "469--480", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103713", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We give an axiomatic presentation of sharing-via-labelling for weak lambda-calculi, that makes it possible to formally compare many different approaches to fully lazy sharing, and obtain two important results. We prove that the known implementations of full laziness are all equivalent in terms of the number of beta-reductions performed, although they behave differently regarding the duplication of terms. We establish a link between the optimality theories of weak lambda-calculi and first-order rewriting systems by expressing fully lazy lambda-lifting in our framework, thus emphasizing the first-order essence of weak reduction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Rastogi:2012:IOG, author = "Aseem Rastogi and Avik Chaudhuri and Basil Hosmer", title = "The ins and outs of gradual type inference", journal = j-SIGPLAN, volume = "47", number = "1", pages = "481--494", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103714", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Gradual typing lets programmers evolve their dynamically typed programs by gradually adding explicit type annotations, which confer benefits like improved performance and fewer run-time failures. However, we argue that such evolution often requires a giant leap, and that type inference can offer a crucial missing step. If omitted type annotations are interpreted as unknown types, rather than the dynamic type, then static types can often be inferred, thereby removing unnecessary assumptions of the dynamic type. The remaining assumptions of the dynamic type may then be removed by either reasoning outside the static type system, or restructuring the code. We present a type inference algorithm that can improve the performance of existing gradually typed programs without introducing any new run-time failures. To account for dynamic typing, types that flow in to an unknown type are treated in a fundamentally different manner than types that flow out. Furthermore, in the interests of backward-compatibility, an escape analysis is conducted to decide which types are safe to infer. We have implemented our algorithm for ActionScript, and evaluated it on the SunSpider and V8 benchmark suites. 
We demonstrate that our algorithm can improve the performance of unannotated programs as well as recover most of the type annotations in annotated programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Hofmann:2012:EL, author = "Martin Hofmann and Benjamin Pierce and Daniel Wagner", title = "Edit lenses", journal = j-SIGPLAN, volume = "47", number = "1", pages = "495--508", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103715", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A lens is a bidirectional transformation between a pair of connected data structures, capable of translating an edit on one structure into an appropriate edit on the other. Many varieties of lenses have been studied, but none, to date, has offered a satisfactory treatment of how edits are represented. Many foundational accounts only consider edits of the form `overwrite the whole structure,' leading to poor behavior in many situations by failing to track the associations between corresponding parts of the structures when elements are inserted and deleted in ordered lists, for example. Other theories of lenses do maintain these associations, either by annotating the structures themselves with change information or using auxiliary data structures, but every extant theory assumes that the entire original source structure is part of the information passed to the lens. We offer a general theory of edit lenses, which work with descriptions of changes to structures, rather than with the structures themselves. We identify a simple notion of `editable structure'--a set of states plus a monoid of edits with a partial monoid action on the states--and construct a semantic space of lenses between such structures, with natural laws governing their behavior. We show how a range of constructions from earlier papers on `state-based' lenses can be carried out in this space, including composition, products, sums, list operations, etc. Further, we show how to construct edit lenses for arbitrary containers in the sense of Abbott, Altenkirch, and Ghani. 
Finally, we show that edit lenses refine a well-known formulation of state-based lenses, in the sense that every state-based lens gives rise to an edit lens over structures with a simple overwrite-only edit language, and conversely every edit lens on such structures gives rise to a state-based lens.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Batty:2012:CCC, author = "Mark Batty and Kayvan Memarian and Scott Owens and Susmit Sarkar and Peter Sewell", title = "Clarifying and compiling {C\slash C++} concurrency: from {C++11} to {POWER}", journal = j-SIGPLAN, volume = "47", number = "1", pages = "509--520", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103717", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The upcoming C and C++ revised standards add concurrency to the languages, for the first time, in the form of a subtle *relaxed memory model* (the *C++11 model*). This aims to permit compiler optimisation and to accommodate the differing relaxed-memory behaviours of mainstream multiprocessors, combining simple semantics for most code with high-performance *low-level atomics* for concurrency libraries. In this paper, we first establish two simpler but provably equivalent models for C++11, one for the full language and another for the subset without consume operations. Subsetting further to the fragment without low-level atomics, we identify a subtlety arising from atomic initialisation and prove that, under an additional condition, the model is equivalent to sequential consistency for race-free programs. We then prove our main result, the correctness of two proposed compilation schemes for the C++11 load and store concurrency primitives to Power assembly, having noted that an earlier proposal was flawed. (The main ideas apply also to ARM, which has a similar relaxed memory architecture.) This should inform the ongoing development of production compilers for C++11 and C1x, clarifies what properties of the machine architecture are required, and builds confidence in the C++11 and Power semantics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Ramananandro:2012:MSC, author = "Tahina Ramananandro and Gabriel {Dos Reis} and Xavier Leroy", title = "A mechanized semantics for {C++} object construction and destruction, with applications to resource management", journal = j-SIGPLAN, volume = "47", number = "1", pages = "521--532", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103718", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a formal operational semantics and its Coq mechanization for the C++ object model, featuring object construction and destruction, shared and repeated multiple inheritance, and virtual function call dispatch. These are key C++ language features for high-level system programming, in particular for predictable and reliable resource management. 
This paper is the first to present a formal mechanized account of the metatheory of construction and destruction in C++, and applications to popular programming techniques such as `resource acquisition is initialization'. We also report on irregularities and apparent contradictions in the ISO C++03 and C++11 standards.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Ellison:2012:EFS, author = "Chucky Ellison and Grigore Rosu", title = "An executable formal semantics of {C} with applications", journal = j-SIGPLAN, volume = "47", number = "1", pages = "533--544", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103719", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes an executable formal semantics of C. Being executable, the semantics has been thoroughly tested against the GCC torture test suite and successfully passes 99.2\% of 776 test programs. It is the most complete and thoroughly tested formal definition of C to date. The semantics yields an interpreter, debugger, state space search tool, and model checker `for free'. The semantics is shown capable of automatically finding program errors, both statically and at runtime. It is also used to enumerate nondeterministic behavior.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Bhat:2012:TTP, author = "Sooraj Bhat and Ashish Agarwal and Richard Vuduc and Alexander Gray", title = "A type theory for probability density functions", journal = j-SIGPLAN, volume = "47", number = "1", pages = "545--556", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103721", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There has been great interest in creating probabilistic programming languages to simplify the coding of statistical tasks; however, there still does not exist a formal language that simultaneously provides (1) continuous probability distributions, (2) the ability to naturally express custom probabilistic models, and (3) probability density functions (PDFs). This collection of features is necessary for mechanizing fundamental statistical techniques. We formalize the first probabilistic language that exhibits these features, and it serves as a foundational framework for extending the ideas to more general languages. Particularly novel are our type system for absolutely continuous (AC) distributions (those which permit PDFs) and our PDF calculation procedure, which calculates PDFs for a large class of AC distributions. 
Our formalization paves the way toward the rigorous encoding of powerful statistical reformulations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Naden:2012:TSB, author = "Karl Naden and Robert Bocchino and Jonathan Aldrich and Kevin Bierhoff", title = "A type system for borrowing permissions", journal = j-SIGPLAN, volume = "47", number = "1", pages = "557--570", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103722", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In object-oriented programming, unique permissions to object references are useful for checking correctness properties such as consistency of typestate and noninterference of concurrency. To be usable, unique permissions must be borrowed --- for example, one must be able to read a unique reference out of a field, use it for something, and put it back. While one can null out the field and later reassign it, this paradigm is ungainly and requires unnecessary writes, potentially hurting cache performance. Therefore, in practice borrowing must occur in the type system, without requiring memory updates. Previous systems support borrowing with external alias analysis and/or explicit programmer management of fractional permissions. While these approaches are powerful, they are also awkward and difficult for programmers to understand. We present an integrated language and type system with unique, immutable, and shared permissions, together with new local permissions that say that a reference may not be stored to the heap. Our system also includes change permissions such as unique {\tt >>} unique and unique {\tt >>} none that describe how permissions flow in and out of method formal parameters. Together, these features support common patterns of borrowing, including borrowing multiple local permissions from a unique reference and recovering the unique reference when the local permissions go out of scope, without any explicit management of fractions in the source language. All accounting of fractional permissions is done by the type system `under the hood.' We present the syntax and static and dynamic semantics of a formal core language and state soundness results. We also illustrate the utility and practicality of our design by using it to express several realistic examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{Strub:2012:SCB, author = "Pierre-Yves Strub and Nikhil Swamy and Cedric Fournet and Juan Chen", title = "Self-certification: bootstrapping certified typecheckers in {F*} with {Coq}", journal = j-SIGPLAN, volume = "47", number = "1", pages = "571--584", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103723", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Well-established dependently-typed languages like Agda and Coq provide reliable ways to build and check formal proofs. 
Several other dependently-typed languages such as Aura, ATS, Cayenne, Epigram, F*, F7, Fine, Guru, PCML5, and Ur also explore reliable ways to develop and verify programs. All these languages shine in their own regard, but their implementations do not themselves enjoy the degree of safety provided by machine-checked verification. We propose a general technique called self-certification that allows a typechecker for a suitably expressive language to be certified for correctness. We have implemented this technique for F*, a dependently typed language on the {.NET} platform. Self-certification involves implementing a typechecker for F* in F*, while using all the conveniences F* provides for the compiler-writer (e.g., partiality, effects, implicit conversions, proof automation, libraries). This typechecker is given a specification (in F*) strong enough to ensure that it computes valid typing derivations. We obtain a typing derivation for the core typechecker by running it on itself, and we export it to Coq as a type-derivation certificate. By typechecking this derivation (in Coq) and applying the F* metatheory (also mechanized in Coq), we conclude that our type checker is correct. Once certified in this manner, the F* typechecker is emancipated from Coq.\par Self-certification leads to an efficient certification scheme --- we no longer depend on verifying certificates in Coq --- as well as a more broadly applicable one. For instance, the self-certified F* checker is suitable for use in adversarial settings where Coq is not intended for use, such as run-time certification of mobile code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '12 conference proceedings.", } @Article{DHondt:2012:ISS, author = "Theo D'Hondt", title = "An interpreter for server-side {HOP}", journal = j-SIGPLAN, volume = "47", number = "2", pages = "1--12", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047851", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "HOP is a Scheme-based multi-tier programming language for the Web. The client-side of a program is compiled to JavaScript, while the server-side is executed by a mix of natively compiled code and interpreted code. At the time where HOP programs were basic scripts, the performance of the server-side interpreter was not a concern; an inefficient interpreter was acceptable. As HOP expanded, HOP programs got larger and more complex. A more efficient interpreter was necessary. This new interpreter is described in this paper. It is compact, its whole implementation counting no more than 2.5 KLOC. It is more than twice faster than the old interpreter and consumes less than a third of its memory. 
Although it cannot compete with static or JIT native compilers, our experimental results show that it is amongst the fastest interpreters for dynamic languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Chang:2012:IOT, author = "Mason Chang and Bernd Mathiske and Edwin Smith and Avik Chaudhuri and Andreas Gal and Michael Bebenita and Christian Wimmer and Michael Franz", title = "The impact of optional type information on {JIT} compilation of dynamically typed languages", journal = j-SIGPLAN, volume = "47", number = "2", pages = "13--24", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047853", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Optionally typed languages enable direct performance comparisons between untyped and type annotated source code. We present a comprehensive performance evaluation of two different JIT compilers in the context of ActionScript, a production-quality optionally typed language. One JIT compiler is optimized for quick compilation rather than JIT compiled code performance. The second JIT compiler is a more aggressively optimizing compiler, performing both high-level and low-level optimizations. We evaluate both JIT compilers directly on the same benchmark suite, measuring their performance changes across fully typed, partially typed, and untyped code. Such evaluations are especially relevant to dynamically typed languages such as JavaScript, which are currently evaluating the idea of adding optional type annotations. We demonstrate that low-level optimizations rarely accelerate the program enough to pay back the investment into performing them in an optionally typed language. Our experiments and data demonstrate that high-level optimizations are required to improve performance by any significant amount.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Homescu:2012:HTJ, author = "Andrei Homescu and Alex Suhan", title = "{HappyJIT}: a tracing {JIT} compiler for {PHP}", journal = j-SIGPLAN, volume = "47", number = "2", pages = "25--36", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047854", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Current websites are a combination of server-generated dynamic content with client-side interactive programs. Dynamically typed languages have gained a lot of ground in both of these domains. The growth of Web 2.0 has introduced a myriad of websites which contain personalized content, which is specific to the user. PHP or Python programs generate the actual HTML page after querying a database and processing the results, which are then presented by the browser. It is becoming more and more vital to accelerate the execution of these programs, as this is a significant part of the total time needed to present the page to the user.
This paper presents a novel interpreter for the PHP language written in RPython, which the PyPy translator then translates into C. The translator integrates into the interpreter a tracing just-in-time compiler which optimizes the hottest loops in the interpreted programs. We also describe a data model that supports all the data types in the PHP language, such as references and iterators. We evaluate the performance of this interpreter, showing that speedups up to a factor of 8 are observed using this approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Zhao:2012:PTI, author = "Tian Zhao", title = "Polymorphic type inference for scripting languages with object extensions", journal = j-SIGPLAN, volume = "47", number = "2", pages = "37--50", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047855", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a polymorphic type inference algorithm for a small subset of JavaScript. The goal is to prevent accessing undefined members of objects. We define a type system that allows explicit extension of objects through an add operation and implicit extension through method calls. The type system also permits strong updates and unrestricted extensions to new objects. The type inference algorithm is modular so that each function definition is only analyzed once and larger programs can be checked incrementally.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Hirschfeld:2012:EUC, author = "Robert Hirschfeld and Michael Perscheid and Michael Haupt", title = "Explicit use-case representation in object-oriented programming languages", journal = j-SIGPLAN, volume = "47", number = "2", pages = "51--60", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047856", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Use-cases are considered an integral part of most contemporary development processes since they describe a software system's expected behavior from the perspective of its prospective users. However, the presence of and traceability to use-cases is increasingly lost in later more code-centric development activities. Use-cases, being well-encapsulated at the level of requirements descriptions, eventually lead to crosscutting concerns in system design and source code. Tracing which parts of the system contribute to which use-cases is therefore hard and so limits understandability. In this paper, we propose an approach to making use-cases first-class entities in both the programming language and the runtime environment. Having use-cases present in the code and the running system will allow developers, maintainers, and operators to easily associate their units of work with what matters to the users. We suggest the combination of use-cases, acceptance tests, and dynamic analysis to automatically associate source code with use-cases.
We present UseCasePy, an implementation of our approach to use-case-centered development in Python, and its application to the Django Web framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Chevalier-Boisvert:2012:BSH, author = "Maxime Chevalier-Boisvert and Erick Lavoie and Marc Feeley and Bruno Dufour", title = "Bootstrapping a self-hosted research virtual machine for {JavaScript}: an experience report", journal = j-SIGPLAN, volume = "47", number = "2", pages = "61--72", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047858", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "JavaScript is one of the most widely used dynamic languages. The performance of existing JavaScript VMs, however, is lower than that of VMs for static languages. There is a need for a research VM to easily explore new implementation approaches. This paper presents the Tachyon JavaScript VM which was designed to be flexible and to allow experimenting with new approaches for the execution of JavaScript. The Tachyon VM is itself implemented in JavaScript and currently supports a subset of the full language that is sufficient to bootstrap itself. The paper discusses the architecture of the system and in particular the bootstrapping of a self-hosted VM. Preliminary performance results indicate that our VM, with few optimizations, can already execute code faster than a commercial JavaScript interpreter on some benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Klock:2012:BLR, author = "Felix S. {Klock II} and William D. Clinger", title = "Bounded-latency regional garbage collection", journal = j-SIGPLAN, volume = "47", number = "2", pages = "73--84", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047859", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Regional garbage collection is scalable, with theoretical worst-case bounds for gc latency, MMU, and throughput that are independent of mutator behavior and the volume of reachable storage.
Regional collection improves upon the worst-case pause times and MMU seen in most other general-purpose collectors, including garbage-first and concurrent mark\slash sweep collectors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Tew:2012:PAM, author = "Kevin Tew and James Swaine and Matthew Flatt and Robert Bruce Findler and Peter Dinda", title = "{Places}: adding message-passing parallelism to {Racket}", journal = j-SIGPLAN, volume = "47", number = "2", pages = "85--96", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047860", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Places bring new support for message-passing parallelism to Racket. This paper gives an overview of the programming model and how we had to modify our existing, sequential runtime-system to support places. We show that the freedom to design the programming model helped us to make the implementation tractable; specifically, we avoided the conventional pain of adding just the right amount of locking to a big, legacy runtime system. The paper presents an evaluation of the design that includes both a real-world application and standard parallel benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Stuchlik:2012:SVD, author = "Andreas Stuchlik and Stefan Hanenberg", title = "Static vs. dynamic type systems: an empirical study about the relationship between type casts and development time", journal = j-SIGPLAN, volume = "47", number = "2", pages = "97--106", month = feb, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2168696.2047861", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 20 17:34:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Static type systems are essential in computer science. However, there is hardly any knowledge about the impact of type systems on the resulting piece of software. While there are authors that state that static types increase the development speed, other authors argue the other way around. A previous experiment suggests that there are multiple factors that play a role for a comparison of statically and dynamically typed languages. As a follow-up, this paper presents an empirical study with 21 subjects that compares programming tasks performed in Java and Groovy --- programming tasks where the number of expected type casts varies in the statically typed language. The result of the study is that the dynamically typed group solved the complete programming tasks significantly faster for most tasks --- but that for larger tasks with a higher number of type casts no significant difference could be found.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '11 conference proceedings.", } @Article{Schultz:2012:MCP, author = "Ulrik P.
Schultz", title = "Multilingual component programming in {Racket}", journal = j-SIGPLAN, volume = "47", number = "3", pages = "1--2", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047864", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the world of Racket, software systems consist of inter-operating components in different programming languages. A component's implementation language may provide the full functionality of Racket, or it may support a small domain-specific notation. Naturally, Racketeers construct languages as Racket components and compose them to create new languages. This talk will present the ideas behind Racket: language-specific components, the composition of components, and, most importantly, the rich support for building languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Rosenmuller:2012:TDS, author = "Marko Rosenm{\"u}ller and Norbert Siegmund and Mario Pukall and Sven Apel", title = "Tailoring dynamic software product lines", journal = j-SIGPLAN, volume = "47", number = "3", pages = "3--12", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047866", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software product lines (SPLs) and adaptive systems aim at variability to cope with changing requirements. Variability can be described in terms of features, which are central for development and configuration of SPLs. In traditional SPLs, features are bound statically before runtime. By contrast, adaptive systems support feature binding at runtime and are sometimes called dynamic SPLs (DSPLs). DSPLs are usually built from coarse-grained components, which reduces the number of possible application scenarios. To overcome this limitation, we closely integrate static binding of traditional SPLs and runtime adaptation of DSPLs. We achieve this integration by statically generating a tailor-made DSPL from a highly customizable SPL. The generated DSPL provides only the runtime variability required by a particular application scenario and the execution environment. The DSPL supports self-configuration based on coarse-grained modules. We provide a feature-based adaptation mechanism that reduces the effort of computing an optimal configuration at runtime. 
In a case study, we demonstrate the practicability of our approach and show that a seamless integration of static binding and runtime adaptation reduces the complexity of the adaptation process.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '11 conference proceedings.", } @Article{Batory:2012:FIP, author = "Don Batory and Peter H{\"o}fner and Jongwook Kim", title = "Feature interactions, products, and composition", journal = j-SIGPLAN, volume = "47", number = "3", pages = "13--22", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047867", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The relationship between feature modules and feature interactions is not well-understood. To explain classic examples of feature interaction, we show that features are not only composed sequentially, but also by cross-product and interaction operations that heretofore were implicit in the literature. Using the Colored IDE (CIDE) tool as our starting point, we (a) present a formal model of these operations, (b) show how it connects and explains previously unrelated results in Feature Oriented Software Development (FOSD), and (c) describe a tool, based on our formalism, that demonstrates how changes in composed documents can be back-propagated to their original feature module definitions, thereby improving FOSD tooling.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '11 conference proceedings.", } @Article{Ribeiro:2012:IFD, author = "M{\'a}rcio Ribeiro and Felipe Queiroz and Paulo Borba and T{\'a}rsis Tol{\^e}do and Claus Brabrand and S{\'e}rgio Soares", title = "On the impact of feature dependencies when maintaining preprocessor-based software product lines", journal = j-SIGPLAN, volume = "47", number = "3", pages = "23--32", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047868", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "During Software Product Line (SPL) maintenance tasks, Virtual Separation of Concerns (VSoC) allows the programmer to focus on one feature and hide the others. However, since features depend on each other through variables and control-flow, feature modularization is compromised since the maintenance of one feature may break another. In this context, emergent interfaces can capture dependencies between the feature we are maintaining and the others, making developers aware of dependencies. To better understand the impact of code level feature dependencies during SPL maintenance, we have investigated the following two questions: how often methods with preprocessor directives contain feature dependencies? How do feature dependencies impact maintenance effort when using VSoC and emergent interfaces? Answering the former is important for assessing how often we may face feature dependency problems. Answering the latter is important to better understand to what extent emergent interfaces complement VSoC during maintenance tasks.
To answer them, we analyze 43 SPLs of different domains, sizes, and languages. The data we collect from them complement previous work on preprocessor usage. They reveal that the feature dependencies we consider in this paper are reasonably common in practice; and that emergent interfaces can reduce maintenance effort during the SPL maintenance tasks we regard here.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '11 conference proceedings.", } @Article{Neves:2012:ISE, author = "La{\'\i}s Neves and Leopoldo Teixeira and Dem{\'o}stenes Sena and Vander Alves and Uir{\'a} Kulezsa and Paulo Borba", title = "Investigating the safe evolution of software product lines", journal = j-SIGPLAN, volume = "47", number = "3", pages = "33--42", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047869", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The adoption of a product line strategy can bring significant productivity and time to market improvements. However, evolving a product line is risky because it might impact many products and their users. So when evolving a product line to introduce new features or to improve its design, it is important to make sure that the behavior of existing products is not affected. In fact, to preserve the behavior of existing products one usually has to analyze different artifacts, like feature models, configuration knowledge and the product line core assets. To better understand this process, in this paper we discover and analyze concrete product line evolution scenarios and, based on the results of this study, we describe a number of safe evolution templates that developers can use when working with product lines. For each template, we show examples of their use in existing product lines. We evaluate the templates by also analyzing the evolution history of two different product lines and demonstrating that they can express the corresponding modifications and then help to avoid the mistakes that we identified during our analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '11 conference proceedings.", } @Article{Hannousse:2012:SAA, author = "Abdelhakim Hannousse and R{\'e}mi Douence and Gilles Ardourel", title = "Static analysis of aspect interaction and composition in component models", journal = j-SIGPLAN, volume = "47", number = "3", pages = "43--52", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047871", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Component based software engineering and aspect orientation are claimed to be two complementary approaches. While the former ensures the modularity and the reusability of software entities, the latter enables the modularity of crosscutting concerns that cannot be modularized as regular components. Nowadays, several approaches and frameworks are dedicated to integrating aspects into component models.
However, when several aspects are woven, aspects may interact with each other which often results in undesirable behavior. The contribution of this paper is twofold. First, we show how aspectized component models can be formally modeled in UPPAAL model checker in order to detect negative interactions (a.k.a., interferences) among aspects. Second, we provide an extendible catalog of composition operators used for aspect composition. We illustrate our general approach with an airport Internet service example.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Otte:2012:ICB, author = "William R. Otte and Aniruddha Gokhale and Douglas C. Schmidt and Johnny Willemsen", title = "Infrastructure for component-based {DDS} application development", journal = j-SIGPLAN, volume = "47", number = "3", pages = "53--62", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047872", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Enterprise distributed real-time and embedded (DRE) systems are increasingly being developed with the use of component-based software techniques. Unfortunately, commonly used component middleware platforms provide limited support for event-based publish/subscribe (pub/sub) mechanisms that meet both quality-of-service (QoS) and configurability requirements of DRE systems. On the other hand, although pub/sub technologies, such as OMG Data Distribution Service (DDS), support a wide range of QoS settings, the level of abstraction they provide make it hard to configure them due to the significant source-level configuration that must be hard-coded at compile time or tailored at run-time using proprietary, ad hoc configuration logic. Moreover, developers of applications using native pub/sub technologies must write large amounts of boilerplate ``glue'' code to support run-time configuration of QoS properties, which is tedious and error-prone. This paper describes a novel, generative approach that combines the strengths of QoS-enabled pub/sub middleware with component-based middleware technologies. In particular, this paper describes the design and implementation of DDS4CIAO which addresses a number of inherent and accidental complexities in the DDS4CCM standard. DDS4CIAO simplifies the development, deployment, and configuration of component-based DRE systems that leverage DDS's powerful QoS capabilities by provisioning DDS QoS policy settings and simplifying the development of DDS applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Li:2012:GGP, author = "Yulin Li and Gordon S. 
{Novak, Jr.}", title = "Generation of geometric programs specified by diagrams", journal = j-SIGPLAN, volume = "47", number = "3", pages = "63--72", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047874", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The GeoGram system [21] generates programs for geometric computations by combining generic software components as specified by diagrams constructed using a graphical interface. The user specifies known and desired quantities. As diagrams are constructed, the system maintains symbolic geometric facts describing the construction. Inferences based on the diagram are used to derive new facts and to introduce new objects based on geometric reasoning, to filter choices presented to the user, to interpret the user's intention in ambiguous cases, to detect over-specification, and to generate the program. A knowledge base of descriptions of generic software components is used to prove that features of the geometry can be computed from known values. These local proofs are combined to guide generation of a program that computes the desired values from inputs. The library of generic geometric program components is used to generate both in-line code and specialized subroutines; partial evaluation improves the efficiency of the generated code. The resulting program is automatically translated into the desired language. The program can also be run interactively to simulate the geometry by generating graphical traces on the diagram as input quantities are varied.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Steck:2012:MDE, author = "Andreas Steck and Alex Lotz and Christian Schlegel", title = "Model-driven engineering and run-time model-usage in service robotics", journal = j-SIGPLAN, volume = "47", number = "3", pages = "73--82", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047875", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The development of service robots has gained more and more attention over the last years. A major challenge on the way towards industrial-strength service robotic systems is to make the step from code-driven to model-driven engineering. In this work we propose to put models into the focus of the whole life-cycle of robotic systems covering design-time as well as run-time. We describe how to explicate parameters, properties and resource information in the models at design-time and how to take these information into account by the run-time system of the robot to support its decision making process. 
We underpin our work by an exhaustive real-world example which is completely developed with our tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Vermolen:2012:GDM, author = "Sander Dani{\"e}l Vermolen and Guido Wachsmuth and Eelco Visser", title = "Generating database migrations for evolving {Web} applications", journal = j-SIGPLAN, volume = "47", number = "3", pages = "83--92", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047876", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "WebDSL is a domain-specific language for the implementation of dynamic web applications with a rich data model. It provides developers with object-oriented data modeling concepts but abstracts over implementation details for persisting application data in relational databases. When the underlying data model of an application evolves, persisted application data has to be migrated. While implementing migration at the database level breaks the abstractions provided by WebDSL, an implementation at the data model level requires to intermingle migration with application code. In this paper, we present a domain-specific language for the coupled evolution of data models and application data. It allows to specify data model evolution as a separate concern at the data model level and can be compiled to migration code at the database level. Its linguistic integration with WebDSL enables static checks for evolution validity and correctness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Danvy:2012:PFS, author = "Olivier Danvy", title = "Pragmatics for formal semantics", journal = j-SIGPLAN, volume = "47", number = "3", pages = "93--94", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047878", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This tech talk describes how to write and how to inter-derive formal semantics for sequential programming languages. The progress reported here is (1) concrete guidelines to write each formal semantics to alleviate their proof obligations, and (2) simple calculational tools to obtain a formal semantics from another.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Shubert:2012:AMB, author = "Gary J. 
Shubert", title = "Application of model based development to flexible code generation", journal = j-SIGPLAN, volume = "47", number = "3", pages = "95--96", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047880", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This address will present the authors views and perspectives on the past, present and future use of model based development techniques to enable the automated generation of source code and other forms of programming. This address will discuss past and present use of model based development and automated code generation at Lockheed Martin, with special emphasis on NASA's Orion Multi-Purpose Crew Vehicle Program. This address will discuss the advantages and disadvantages, associated with the current state of the practice techniques and tools, used to automatically generate source code from general purpose and domain specific models. This address will discuss the obstacles and enablers, associated with achieving the desired future state of complete and efficient automated generation of programming through transformation of general purpose and domain specific models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Asai:2012:RDS, author = "Kenichi Asai", title = "Reflection in direct style", journal = j-SIGPLAN, volume = "47", number = "3", pages = "97--106", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047882", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A reflective language enables us to access, inspect, and/or modify the language semantics from within the same language framework. Although the degree of semantics exposure differs from one language to another, the most powerful approach, referred to as the behavioral reflection, exposes the entire language semantics (or the language interpreter) that defines behavior of user programs for user inspection/modification. In this paper, we deal with the behavioral reflection in the context of a functional language Scheme. In particular, we show how to construct a reflective interpreter where user programs are interpreted by the tower of metacircular interpreters and have the ability to change any parts of the interpreters during execution. Its distinctive feature compared to the previous work is that the metalevel interpreters observed by users are written in direct style. Based on the past attempt of the present author, the current work solves the level-shifting anomaly by defunctionalizing and inspecting the top of the continuation frames. The resulting system enables us to freely go up and down the levels and access/modify the direct-style metalevel interpreter. 
This is in contrast to the previous system where metalevel interpreters were written in continuation-passing style (CPS) and only CPS functions could be exposed to users for modification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Nystrom:2012:FRT, author = "Nathaniel Nystrom and Derek White and Kishen Das", title = "{Firepile}: run-time compilation for {GPUs} in {Scala}", journal = j-SIGPLAN, volume = "47", number = "3", pages = "107--116", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047883", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent advances have enabled GPUs to be used as general-purpose parallel processors on commodity hardware for little cost. However, the ability to program these devices has not kept up with their performance. The programming model for GPUs has a number of restrictions that make it difficult to program. For example, software running on the GPU cannot perform dynamic memory allocation, requiring the programmer to pre-allocate all memory the GPU might use. To achieve good performance, GPU programmers must also be aware of how data is moved between host and GPU memory and between the different levels of the GPU memory hierarchy. We describe Firepile, a library for GPU programming in Scala. The library enables a subset of Scala to be executed on the GPU. Code trees can be created from run-time function values, which can then be analyzed and transformed to generate GPU code. A key property of this mechanism is that it is modular: unlike with other meta-programming constructs, the use of code trees need not be exposed in the library interface. Code trees are general and can be used by library writers in other application domains. Our experiments show Firepile users can achieve performance comparable to C code targeted to the GPU with shorter, simpler, and easier-to-understand code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Esmaeilsabzali:2012:MAC, author = "Shahram Esmaeilsabzali and Bernd Fischer and Joanne M. Atlee", title = "Monitoring aspects for the customization of automatically generated code for big-step models", journal = j-SIGPLAN, volume = "47", number = "3", pages = "117--126", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047884", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The output of a code generator is assumed to be correct and not usually intended to be read or modified; yet programmers are often interested in this, e.g., to monitor a system property. Here, we consider code customization for a family of code generators associated with big-step executable modelling languages (e.g., statecharts). We introduce a customization language that allows us to express customization scenarios for the generated code independently of a specific big-step execution semantics. 
These customization scenarios are all different forms of runtime monitors, which lend themselves to a principled, uniform implementation for observation and code extension. A monitor is given in terms of the enabledness and execution of the transitions of a model and a reachability relation between two states of the execution of the model during a big step. For each monitor, we generate the aspect code that is incorporated into the output of a code generator to implement the monitor at the generated-code level. Thus, we provide means for code analysis through using the vocabulary of a model, rather than the detail of the generated code. Our technique not only requires the code generators to reveal only limited information about their code generation mechanisms, but also keeps the structure of the generated code intact. We demonstrate how various useful properties of a model, or a language, can be checked using our monitors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Lindeman:2012:DDD, author = "Ricky T. Lindeman and Lennart C. L. Kats and Eelco Visser", title = "Declaratively defining domain-specific language debuggers", journal = j-SIGPLAN, volume = "47", number = "3", pages = "127--136", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047885", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Tool support is vital to the effectiveness of domain-specific languages. With language workbenches, domain-specific languages and their tool support can be generated from a combined, high-level specification. This paper shows how such a specification can be extended to describe a debugger for a language. To realize this, we introduce a meta-language for coordinating the debugger that abstracts over the complexity of writing a debugger by hand. We describe the implementation of a language-parametric infrastructure for debuggers that can be instantiated based on this specification. The approach is implemented in the Spoofax language workbench and validated through realistic case studies with the Stratego transformation language and the WebDSL web programming language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Arnoldus:2012:LMU, author = "B. J. Arnoldus and M. G. J. van den Brand and A. Serebrenik", title = "Less is more: unparser-completeness of metalanguages for template engines", journal = j-SIGPLAN, volume = "47", number = "3", pages = "137--146", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047887", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A code generator is a program translating an input model into code. In this paper we focus on template-based code generators in the context of the model view controller architecture (MVC). The language in which the code generator is written is known as a metalanguage in the code generation parlance. 
The metalanguage should be, on the one side, expressive enough to be of practical value, and, on the other side, restricted enough to enforce the separation between the view and the model, according to the MVC. In this paper we advocate the notion of unparser-complete metalanguages as providing the right level of expressivity. An unparser-complete metalanguage is capable of expressing an unparser, a code generator that translates any legal abstract syntax tree into an equivalent sentence of the corresponding context-free language. A metalanguage not able to express an unparser will fail to produce all sentences belonging to the corresponding context-free language. A metalanguage able to express more than an unparser will also be able to implement code violating the model/view separation. We further show that a metalanguage with the power of a linear deterministic tree-to-string transducer is unparser-complete. Moreover, this metalanguage has been successfully applied in a non-trivial case study where an existing code generator is refactored using templates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Slaatten:2012:TAG, author = "Vidar Sl{\aa}tten and Frank Alexander Kraemer and Peter Herrmann", title = "Towards automatic generation of formal specifications to validate and verify reliable distributed systems: a method exemplified by an industrial case study", journal = j-SIGPLAN, volume = "47", number = "3", pages = "147--156", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047888", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The validation and verification of reliable systems is a difficult and complex task, mainly for two reasons: First, it is difficult to precisely state which formal properties a system needs to fulfil to be of high quality. Second, it is complex to automatically verify such properties, due to the size of the analysis state space which grows exponentially with the number of components. We tackle these problems by a tool-supported method which embeds application functionality in building blocks that use UML activities to describe their internal behaviour. To describe their externally visible behaviour, we use a combination of complementary interface contracts, so-called ESMs and EESMs. In this paper, we present an extension of the interface contracts, External Reliability Contracts (ERCs), that capture failure behaviour. This separation of different behavioural aspects in separate descriptions facilitates a two-step analysis, in which the first step is completely automated and the second step is facilitated by an automatic translation of the models to the input syntax of the model checker TLC. Further, the cascade of contracts is used to separate the work of domain and reliability experts. 
The concepts are proposed with the background of a real industry case, and we demonstrate how the use of interface contracts leads to significantly smaller state spaces in the analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Sobernig:2012:CCA, author = "Stefan Sobernig and Patrick Gaubatz and Mark Strembeck and Uwe Zdun", title = "Comparing complexity of {API} designs: an exploratory experiment on {DSL}-based framework integration", journal = j-SIGPLAN, volume = "47", number = "3", pages = "157--166", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047890", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Embedded, textual DSLs are often provided as an API wrapped around object-oriented application frameworks to ease framework integration. While literature presents claims that DSL-based application development is beneficial, empirical evidence for this is rare. We present the results of an experiment comparing the complexity of three different object-oriented framework APIs and an embedded, textual DSL. For this comparative experiment, we implemented the same, non-trivial application scenario using these four different APIs. Then, we performed an Object-Points (OP) analysis, yielding indicators for the API complexity specific to each API variant. The main observation for our experiment is that the embedded, textual DSL incurs the smallest API complexity. Although the results are exploratory, as well as limited to the given application scenario and a single embedded DSL, our findings can direct future empirical work. The experiment design is applicable for similar API design evaluations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Erdweg:2012:GLE, author = "Sebastian Erdweg and Lennart C. L. Kats and Tillmann Rendel and Christian K{\"a}stner and Klaus Ostermann and Eelco Visser", title = "Growing a language environment with editor libraries", journal = j-SIGPLAN, volume = "47", number = "3", pages = "167--176", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047891", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large software projects consist of code written in a multitude of different (possibly domain-specific) languages, which are often deeply interspersed even in single files. While many proposals exist on how to integrate languages semantically and syntactically, the question of how to support this scenario in integrated development environments (IDEs) remains open: How can standard IDE services, such as syntax highlighting, outlining, or reference resolving, be provided in an extensible and compositional way, such that an open mix of languages is supported in a single file? Based on our library-based syntactic extension language for Java, SugarJ, we propose to make IDEs extensible by organizing editor services in editor libraries. 
Editor libraries are libraries written in the object language, SugarJ, and hence activated and composed through regular import statements on a file-by-file basis. We have implemented an IDE for editor libraries on top of SugarJ and the Eclipse-based Spoofax language workbench. We have validated editor libraries by evolving this IDE into a fully-fledged and schema-aware XML editor as well as an extensible Latex editor, which we used for writing this paper.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Freeman:2012:HPH, author = "John Freeman and Jaakko J{\"a}rvi and Wonseok Kim and Mat Marcus and Sean Parent", title = "Helping programmers help users", journal = j-SIGPLAN, volume = "47", number = "3", pages = "177--184", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047892", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "User interfaces exhibit a wide range of features that are designed to assist users. Interaction with one widget may trigger value changes, disabling, or other behaviors in other widgets. Such automatic behavior may be confusing or disruptive to users. Research literature on user interfaces offers a number of solutions, including interface features for explaining or controlling these behaviors. To help programmers help users, the implementation costs of these features need to be much lower. Ideally, they could be generated for free. This paper shows how several help and control mechanisms can be implemented as algorithms and reused across interfaces, making the cost of their adoption negligible. Specifically, we describe generic help mechanisms for visualizing data flow and explaining command deactivation, and a mechanism for controlling the flow of data. A reusable implementation of these features is enabled by our property model framework, where the data manipulated through a user interface is modeled as a constraint system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Launchbury:2012:TBC, author = "John Launchbury", title = "Theorem-based circuit derivation in {Cryptol}", journal = j-SIGPLAN, volume = "47", number = "3", pages = "185--186", month = mar, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2189751.2047894", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:00 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Even though step-by-step refinement has long been seen as desirable, it is hard to find compelling industrial applications of the technique. In theory, transforming a high-level specification into a high-performance implementation is an ideal means of producing a correct design, but in practice it is hard to make it work, and even harder to make it worthwhile. This talk describes an exception. We introduce the domain-specific language, Cryptol, and work up to a design experience in which theorem-based refinement played a crucial role in producing an industrial quality FPGA encryptor and decryptor for AES. 
Quite simply, we are unlikely to have succeeded without the technique. The Cryptol specification language was designed by Galois for the NSA as a public standard for specifying cryptographic algorithms. A Cryptol reference specification can serve as the formal documentation for a cryptographic module, eliminating the need for separate and voluminous English descriptions. Cryptol is fully executable, allowing designers to experiment with their programs incrementally as their designs evolve. Cryptol compilers can generate C, C++, and Haskell software implementations, and VHDL or Verilog HDL hardware implementations. These generators can significantly reduce overall life-cycle costs of cryptographic solutions. For example, Cryptol allows engineers and mathematicians to program cryptographic algorithms on FPGAs as if they were writing software. The design experience we describe runs as follows: we begin with a specification for AES written in Cryptol, and over a series of five design stages we produce an industrial grade encrypt core. In each stage, we state theorems which relate the component behaviors in one stage with the corresponding behaviors in the refinement. The resulting cores, running at 350Mhz-440Mhz depending on the FPGA part, bear little relationship to the original, except that the step-by-step theorems ensured we had not gone astray. We then repeat the pattern in generating a circuit for AES decrypt. While there are many similarities between encrypt and decrypt in AES, there are some crucial differences with regard to high performance. First concerns the generation of key material. The AES key is used as a seed for a specific pseudo-random number generator which produces key material for use in each of the AES rounds. For encrypt, the key-generator runs in sync with the action of encryption, so may be scheduled alongside it. For decrypt, they run counter to one-another, creating a major challenge to be overcome. Second, the generated key material has an additional transformation applied to it, which occurs deep in the middle of the high performing core. Using theorems as stepping stones along the way, we redesign the key expansion algorithm so that it will run in sync with the decryption. We then trace parallel steps to the derivation of encrypt, establishing a series of commuting diagrams along the way. Whenever we confronted bugs in the development process, we produced many theorems to isolate the bugs, using theorems as a principled kind of printf. When the bugs were found and eradicated, we elided many of the temporary theorems, leaving behind those that provided important insights into the behavior of the code. This talk is a story of the journey with demonstrations of the tool at work. Its ultimate message is to highlight the value of including a theorem facility within purely functional domain-specific languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GCPE '11 conference proceedings.", } @Article{Larus:2012:CWC, author = "James R. 
Larus", title = "The cloud will change everything", journal = j-SIGPLAN, volume = "47", number = "4", pages = "1--2", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950367", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cloud computing is fast on its way to becoming a meaningless, oversold marketing slogan. In the midst of this hype, it is easy to overlook the fundamental change that is occurring. Computation, which used to be confined to the machine beside your desk, is increasingly centralized in vast shared facilities and at the same time liberated by battery-powered, wireless devices. Performance, security, and reliability are no longer problems that can be considered in isolation --- the wires and software connecting pieces offer more challenges and opportunities than components themselves. The eXtreme Computing Group (XCG) in Microsoft Research is taking a holistic approach to research in this area, by bring together researchers and developers with expertise in data center design, computer architecture, operating systems, computer security, programming language, mobile computation, and user interfaces to tackle the challenges of cloud computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Yuan:2012:ISD, author = "Ding Yuan and Jing Zheng and Soyeon Park and Yuanyuan Zhou and Stefan Savage", title = "Improving software diagnosability via log enhancement", journal = j-SIGPLAN, volume = "47", number = "4", pages = "3--14", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950369", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Diagnosing software failures in the field is notoriously difficult, in part due to the fundamental complexity of trouble-shooting any complex software system, but further exacerbated by the paucity of information that is typically available in the production setting. Indeed, for reasons of both overhead and privacy, it is common that only the run-time log generated by a system (e.g., syslog) can be shared with the developers. Unfortunately, the ad-hoc nature of such reports are frequently insufficient for detailed failure diagnosis. This paper seeks to improve this situation within the rubric of existing practice. We describe a tool, LogEnhancer that automatically ``enhances'' existing logging code to aid in future post-failure debugging. We evaluate LogEnhancer on eight large, real-world applications and demonstrate that it can dramatically reduce the set of potential root failure causes that must be considered during diagnosis while imposing negligible overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Veeraraghavan:2012:DPS, author = "Kaushik Veeraraghavan and Dongyoon Lee and Benjamin Wester and Jessica Ouyang and Peter M. 
Chen and Jason Flinn and Satish Narayanasamy", title = "{DoublePlay}: parallelizing sequential logging and replay", journal = j-SIGPLAN, volume = "47", number = "4", pages = "15--26", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950370", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Deterministic replay systems record and reproduce the execution of a hardware or software system. In contrast to replaying execution on uniprocessors, deterministic replay on multiprocessors is very challenging to implement efficiently because of the need to reproduce the order or values read by shared memory operations performed by multiple threads. In this paper, we present DoublePlay, a new way to efficiently guarantee replay on commodity multiprocessors. Our key insight is that one can use the simpler and faster mechanisms of single-processor record and replay, yet still achieve the scalability offered by multiple cores, by using an additional execution to parallelize the record and replay of an application. DoublePlay timeslices multiple threads on a single processor, then runs multiple time intervals (epochs) of the program concurrently on separate processors. This strategy, which we call uniparallelism, makes logging much easier because each epoch runs on a single processor (so threads in an epoch never simultaneously access the same memory) and different epochs operate on different copies of the memory. Thus, rather than logging the order of shared-memory accesses, we need only log the order in which threads in an epoch are timesliced on the processor. DoublePlay runs an additional execution of the program on multiple processors to generate checkpoints so that epochs run in parallel. We evaluate DoublePlay on a variety of client, server, and scientific parallel benchmarks; with spare cores, DoublePlay reduces logging overhead to an average of 15\% with two worker threads and 28\% with four threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Casper:2012:HAT, author = "Jared Casper and Tayo Oguntebi and Sungpack Hong and Nathan G. Bronson and Christos Kozyrakis and Kunle Olukotun", title = "Hardware acceleration of transactional memory on commodity systems", journal = j-SIGPLAN, volume = "47", number = "4", pages = "27--38", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950372", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The adoption of transactional memory is hindered by the high overhead of software transactional memory and the intrusive design changes required by previously proposed TM hardware. We propose that hardware to accelerate software transactional memory (STM) can reside outside an unmodified commodity processor core, thereby substantially reducing implementation costs. This paper introduces Transactional Memory Acceleration using Commodity Cores (TMACC), a hardware-accelerated TM system that does not modify the processor, caches, or coherence protocol. 
We present a complete hardware implementation of TMACC using a rapid prototyping platform. Using this hardware, we implement two unique conflict detection schemes which are accelerated using Bloom filters on an FPGA. These schemes employ novel techniques for tolerating the latency of fine-grained asynchronous communication with an out-of-core accelerator. We then conduct experiments to explore the feasibility of accelerating TM without modifying existing system hardware. We show that, for all but short transactions, it is not necessary to modify the processor to obtain substantial improvement in TM performance. In these cases, TMACC outperforms an STM by an average of 69\% in applications using moderate-length transactions, showing maximum speedup within 8\% of an upper bound on TM acceleration. Overall, we demonstrate that hardware can substantially accelerate the performance of an STM on unmodified commodity processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Dalessandro:2012:HNC, author = "Luke Dalessandro and Fran{\c{c}}ois Carouge and Sean White and Yossi Lev and Mark Moir and Michael L. Scott and Michael F. Spear", title = "Hybrid {NOrec}: a case study in the effectiveness of best effort hardware transactional memory", journal = j-SIGPLAN, volume = "47", number = "4", pages = "39--52", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950373", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactional memory (TM) is a promising synchronization mechanism for the next generation of multicore processors. Best-effort Hardware Transactional Memory (HTM) designs, such as Sun's prototype Rock processor and AMD's proposed Advanced Synchronization Facility (ASF), can efficiently execute many transactions, but abort in some cases due to various limitations. Hybrid TM systems can use a compatible software TM (STM) in such cases. We introduce a family of hybrid TMs built using the recent NOrec STM algorithm that, unlike existing hybrid approaches, provide both low overhead on hardware transactions and concurrent execution of hardware and software transactions. We evaluate implementations for Rock and ASF, exploring how the differing HTM designs affect optimization choices. 
Our investigation yields valuable input for designers of future best-effort HTMs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Singh:2012:EPS, author = "Abhayendra Singh and Daniel Marino and Satish Narayanasamy and Todd Millstein and Madan Musuvathi", title = "Efficient processor support for {DRFx}, a memory model with exceptions", journal = j-SIGPLAN, volume = "47", number = "4", pages = "53--66", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950375", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A longstanding challenge of shared-memory concurrency is to provide a memory model that allows for efficient implementation while providing strong and simple guarantees to programmers. The C++0x and Java memory models admit a wide variety of compiler and hardware optimizations and provide sequentially consistent (SC) semantics for data-race-free programs. However, they either do not provide any semantics (C++0x) or provide a hard-to-understand semantics (Java) for racy programs, compromising the safety and debuggability of such programs. In earlier work we proposed the DRFx memory model, which addresses this problem by dynamically detecting potential violations of SC due to the interaction of compiler or hardware optimizations with data races and halting execution upon detection. In this paper, we present a detailed micro-architecture design for supporting the DRFx memory model, formalize the design and prove its correctness, and evaluate the design using a hardware simulator. We describe a set of DRFx-compliant complexity-effective optimizations which allow us to attain performance close to that of TSO (Total Store Order) and DRF0 while providing strong guarantees for all programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Devietti:2012:RRC, author = "Joseph Devietti and Jacob Nelson and Tom Bergan and Luis Ceze and Dan Grossman", title = "{RCDC}: a relaxed consistency deterministic computer", journal = j-SIGPLAN, volume = "47", number = "4", pages = "67--78", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950376", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Providing deterministic execution significantly simplifies the debugging, testing, replication, and deployment of multithreaded programs. Recent work has developed deterministic multiprocessor architectures as well as compiler and runtime systems that enforce determinism in current hardware. Such work has incidentally imposed strong memory-ordering properties. Historically, memory ordering has been relaxed in favor of higher performance in shared memory multiprocessors and, interestingly, determinism exacerbates the cost of strong memory ordering. Consequently, we argue that relaxed memory ordering is vital to achieving faster deterministic execution.
This paper introduces RCDC, a deterministic multiprocessor architecture that takes advantage of relaxed memory orderings to provide high-performance deterministic execution with low hardware complexity. RCDC has two key innovations: a hybrid HW/SW approach to enforcing determinism; and a new deterministic execution strategy that leverages data-race-free-based memory models (e.g., the models for Java and C++) to improve performance and scalability without sacrificing determinism, even in the presence of races. In our hybrid HW/SW approach, the only hardware mechanisms required are software-controlled store buffering and support for precise instruction counting; we do not require speculation. A runtime system uses these mechanisms to enforce determinism for arbitrary programs. We evaluate RCDC using PARSEC benchmarks and show that relaxing memory ordering leads to performance and scalability close to nondeterministic execution without requiring any form of speculation. We also compare our new execution strategy to one based on TSO (total-store-ordering) and show that some applications benefit significantly from the extra relaxation. We also evaluate a software-only implementation of our new deterministic execution strategy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Burnim:2012:SCS, author = "Jacob Burnim and George Necula and Koushik Sen", title = "Specifying and checking semantic atomicity for multithreaded programs", journal = j-SIGPLAN, volume = "47", number = "4", pages = "79--90", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In practice, it is quite difficult to write correct multithreaded programs due to the potential for unintended and nondeterministic interference between parallel threads. A fundamental correctness property for such programs is atomicity---a block of code in a program is atomic if, for any parallel execution of the program, there is an execution with the same overall program behavior in which the block is executed serially. We propose semantic atomicity, a generalization of atomicity with respect to a programmer-defined notion of equivalent behavior. We propose an assertion framework in which a programmer can use bridge predicates to specify noninterference properties at the level of abstraction of their application. Further, we propose a novel algorithm for systematically testing atomicity specifications on parallel executions with a bounded number of interruptions---i.e. atomic blocks whose execution is interleaved with that of other threads. We further propose a set of sound heuristics and optional user annotations that increase the efficiency of checking atomicity specifications in the common case where the specifications hold. We have implemented our assertion framework for specifying and checking semantic atomicity for parallel Java programs, and we have written semantic atomicity specifications for a number of benchmarks. We found that using bridge predicates allowed us to specify the natural and intended atomic behavior of a wider range of programs than did previous approaches. 
Further, in checking our specifications, we found several previously unknown bugs, including in the widely-used java.util.concurrent library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Volos:2012:MLP, author = "Haris Volos and Andres Jaan Tack and Michael M. Swift", title = "{Mnemosyne}: lightweight persistent memory", journal = j-SIGPLAN, volume = "47", number = "4", pages = "91--104", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950379", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "New storage-class memory (SCM) technologies, such as phase-change memory, STT-RAM, and memristors, promise user-level access to non-volatile storage through regular memory instructions. These memory devices enable fast user-mode access to persistence, allowing regular in-memory data structures to survive system crashes. In this paper, we present Mnemosyne, a simple interface for programming with persistent memory. Mnemosyne addresses two challenges: how to create and manage such memory, and how to ensure consistency in the presence of failures. Without additional mechanisms, a system failure may leave data structures in SCM in an invalid state, crashing the program the next time it starts. In Mnemosyne, programmers declare global persistent data with the keyword ``pstatic'' or allocate it dynamically. Mnemosyne provides primitives for directly modifying persistent variables and supports consistent updates through a lightweight transaction mechanism. Compared to past work on disk-based persistent memory, Mnemosyne reduces latency to storage by writing data directly to memory at the granularity of an update rather than writing memory pages back to disk through the file system. In tests emulating the performance characteristics of forthcoming SCMs, we show that Mnemosyne can persist data as fast as 3 microseconds. Furthermore, it provides a 35 percent performance increase when applied in the OpenLDAP directory server. In microbenchmark studies we find that Mnemosyne can be up to 1400\% faster than alternative persistence strategies, such as Berkeley DB or Boost serialization, that are designed for disks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Coburn:2012:NHM, author = "Joel Coburn and Adrian M. Caulfield and Ameen Akel and Laura M. Grupp and Rajesh K. Gupta and Ranjit Jhala and Steven Swanson", title = "{NV-Heaps}: making persistent objects fast and safe with next-generation, non-volatile memories", journal = j-SIGPLAN, volume = "47", number = "4", pages = "105--118", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950380", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Persistent, user-defined objects present an attractive abstraction for working with non-volatile program state. However, the slow speed of persistent storage (i.e., disk) has restricted their design and limited their performance. 
Fast, byte-addressable, non-volatile technologies, such as phase change memory, will remove this constraint and allow programmers to build high-performance, persistent data structures in non-volatile storage that is almost as fast as DRAM. Creating these data structures requires a system that is lightweight enough to expose the performance of the underlying memories but also ensures safety in the presence of application and system failures by avoiding familiar bugs such as dangling pointers, multiple free()s, and locking errors. In addition, the system must prevent new types of hard-to-find pointer safety bugs that only arise with persistent objects. These bugs are especially dangerous since any corruption they cause will be permanent. We have implemented a lightweight, high-performance persistent object system called NV-heaps that provides transactional semantics while preventing these errors and providing a model for persistence that is easy to use and reason about. We implement search trees, hash tables, sparse graphs, and arrays using NV-heaps, BerkeleyDB, and Stasis. Our results show that NV-heap performance scales with thread count and that data structures implemented using NV-heaps out-perform BerkeleyDB and Stasis implementations by 32x and 244x, respectively, by avoiding the operating system and minimizing other software overheads. We also quantify the cost of enforcing the safety guarantees that NV-heaps provide and measure the costs of NV-heap primitive operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Schupbach:2012:DLA, author = "Adrian Sch{\"u}pbach and Andrew Baumann and Timothy Roscoe and Simon Peter", title = "A declarative language approach to device configuration", journal = j-SIGPLAN, volume = "47", number = "4", pages = "119--132", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950382", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "C remains the language of choice for hardware programming (device drivers, bus configuration, etc.): it is fast, allows low-level access, and is trusted by OS developers. However, the algorithms required to configure and reconfigure hardware devices and interconnects are becoming more complex and diverse, with the added burden of legacy support, quirks, and hardware bugs to work around. Even programming PCI bridges in a modern PC is a surprisingly complex problem, and is getting worse as new functionality such as hotplug appears. Existing approaches use relatively simple algorithms, hard-coded in C and closely coupled with low-level register access code, generally leading to suboptimal configurations. We investigate the merits and drawbacks of a new approach: separating hardware configuration logic (algorithms to determine configuration parameter values) from mechanism (programming device registers). The latter we keep in C, and the former we encode in a declarative programming language with constraint-satisfaction extensions. As a test case, we have implemented full PCI configuration, resource allocation, and interrupt assignment in the Barrelfish research operating system, using a concise expression of efficient algorithms in constraint logic programming. 
We show that the approach is tractable, and can successfully configure a wide range of PCs with competitive runtime cost. Moreover, it requires about half the code of the C-based approach in Linux while offering considerably more functionality. Additionally it easily accommodates adaptations such as hotplug, fixed regions, and quirks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Ryzhyk:2012:IDD, author = "Leonid Ryzhyk and John Keys and Balachandra Mirla and Arun Raghunath and Mona Vij and Gernot Heiser", title = "Improved device driver reliability through hardware verification reuse", journal = j-SIGPLAN, volume = "47", number = "4", pages = "133--144", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950383", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Faulty device drivers are a major source of operating system failures. We argue that the underlying cause of many driver faults is the separation of two highly-related tasks: device verification and driver development. These two tasks have a lot in common, and result in software that is conceptually and functionally similar, yet kept totally separate. The result is a particularly bad case of duplication of effort: the verification code is correct, but is discarded after the device has been manufactured; the driver code is inferior, but used in actual device operation. We claim that the two tasks, and the software they produce, can and should be unified, and this will result in drastic improvement of device-driver quality and reduction in the development cost and time to market. In this paper we propose a device driver design and verification workflow that achieves such unification. We apply this workflow to develop and test drivers for four different I/O devices and demonstrate that it improves the driver test coverage and allows detecting driver defects that are extremely hard to find using conventional testing techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Hashmi:2012:CNI, author = "Atif Hashmi and Andrew Nere and James Jamal Thomas and Mikko Lipasti", title = "A case for neuromorphic {ISAs}", journal = j-SIGPLAN, volume = "47", number = "4", pages = "145--158", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950385", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The desire to create novel computing systems, paired with recent advances in neuroscientific understanding of the brain, has led researchers to develop neuromorphic architectures that emulate the brain. To date, such models are developed, trained, and deployed on the same substrate. However, excessive co-dependence between the substrate and the algorithm prevents portability, or at the very least requires reconstructing and retraining the model whenever the substrate changes. 
This paper proposes a well-defined abstraction layer --- the Neuromorphic instruction set architecture, or NISA --- that separates a neural application's algorithmic specification from the underlying execution substrate, and describes the Aivo framework, which demonstrates the concrete advantages of such an abstraction layer. Aivo consists of a NISA implementation for a rate-encoded neuromorphic system based on the cortical column abstraction, a state-of-the-art integrated development and runtime environment (IDE), and various profile-based optimization tools. Aivo's IDE generates code for emulating cortical networks on the host CPU, multiple GPGPUs, or as boolean functions. Its runtime system can deploy and adaptively optimize cortical networks in a manner similar to conventional just-in-time compilers in managed runtime systems (e.g. Java, C\#). We demonstrate the abilities of the NISA abstraction by constructing a cortical network model of the mammalian visual cortex, deploying on multiple execution substrates, and utilizing the various optimization tools we have created. For this hierarchical configuration, Aivo's profiling based network optimization tools reduce the memory footprint by 50\% and improve the execution time by a factor of 3x on the host CPU. Deploying the same network on a single GPGPU results in a 30x speedup. We further demonstrate that a speedup of 480x can be achieved by deploying a massively scaled cortical network across three GPGPUs. Finally, converting a trained hierarchical network to C/C++ boolean constructs on the host CPU results in 44x speedup.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Ransford:2012:MSS, author = "Benjamin Ransford and Jacob Sorber and Kevin Fu", title = "{Mementos}: system support for long-running computation on {RFID}-scale devices", journal = j-SIGPLAN, volume = "47", number = "4", pages = "159--170", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950386", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transiently powered computing devices such as RFID tags, kinetic energy harvesters, and smart cards typically rely on programs that complete a task under tight time constraints before energy starvation leads to complete loss of volatile memory. Mementos is a software system that transforms general-purpose programs into interruptible computations that are protected from frequent power losses by automatic, energy-aware state checkpointing. Mementos comprises a collection of optimization passes for the LLVM compiler infrastructure and a linkable library that exercises hardware support for energy measurement while managing state checkpoints stored in nonvolatile memory. We evaluate Mementos against diverse test cases in a trace-driven simulator of transiently powered RFID-scale devices. Although Mementos's energy checks increase run time when energy is plentiful, they allow Mementos to safely suspend execution when energy dwindles, effectively spreading computation across zero or more power failures. 
This paper's contributions are: a study of the runtime environment for programs on RFID-scale devices; an energy-aware state checkpointing system for these devices that is implemented for the MSP430 family of microcontrollers; and a trace-driven simulator of transiently powered RFID-scale devices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Koukoumidis:2012:PC, author = "Emmanouil Koukoumidis and Dimitrios Lymberopoulos and Karin Strauss and Jie Liu and Doug Burger", title = "Pocket cloudlets", journal = j-SIGPLAN, volume = "47", number = "4", pages = "171--184", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950387", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cloud services accessed through mobile devices suffer from high network access latencies and are constrained by energy budgets dictated by the devices' batteries. Radio and battery technologies will improve over time, but are still expected to be the bottlenecks in future systems. Non-volatile memories (NVM), however, may continue experiencing significant and steady improvements in density for at least ten more years. In this paper, we propose to leverage the abundance in memory capacity of mobile devices to mitigate latency and energy issues when accessing cloud services. We first analyze NVM technology scaling trends, and then propose a cloud service cache architecture that resides on the mobile device's NVM (pocket cloudlet). This architecture utilizes both individual user and community access models to maximize its hit rate, and subsequently reduce overall service latency and energy consumption. As a showcase we present the design, implementation and evaluation of PocketSearch, a search and advertisement pocket cloudlet. We perform mobile search characterization to guide the design of PocketSearch and evaluate it with 200 million mobile queries from the search logs of m.bing.com. We show that PocketSearch can serve, on average, 66\% of the web search queries submitted by an individual user without having to use the slow 3G link, leading to 16x service access speedup. Finally, based on experience with PocketSearch we provide additional insight and guidelines on how future pocket cloudlets should be organized, from both an architectural and an operating system perspective.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Sharma:2012:BMS, author = "Navin Sharma and Sean Barker and David Irwin and Prashant Shenoy", title = "{Blink}: managing server clusters on intermittent power", journal = j-SIGPLAN, volume = "47", number = "4", pages = "185--198", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950389", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reducing the energy footprint of data centers continues to receive significant attention due to both its financial and environmental impact. 
There are numerous methods that limit the impact of both factors, such as expanding the use of renewable energy or participating in automated demand-response programs. To take advantage of these methods, servers and applications must gracefully handle intermittent constraints in their power supply. In this paper, we propose blinking---metered transitions between a high-power active state and a low-power inactive state---as the primary abstraction for conforming to intermittent power constraints. We design Blink, an application-independent hardware-software platform for developing and evaluating blinking applications, and define multiple types of blinking policies. We then use Blink to design BlinkCache, a blinking version of memcached, to demonstrate the effect of blinking on an example application. Our results show that a load-proportional blinking policy combines the advantages of both activation and synchronous blinking for realistic Zipf-like popularity distributions and wind/solar power signals by achieving near optimal hit rates (within 15\% of an activation policy), while also providing fairer access to the cache (within 2\% of a synchronous policy) for equally popular objects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Hoffmann:2012:DKR, author = "Henry Hoffmann and Stelios Sidiroglou and Michael Carbin and Sasa Misailovic and Anant Agarwal and Martin Rinard", title = "Dynamic knobs for responsive power-aware computing", journal = j-SIGPLAN, volume = "47", number = "4", pages = "199--212", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950390", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present PowerDial, a system for dynamically adapting application behavior to execute successfully in the face of load and power fluctuations. PowerDial transforms static configuration parameters into dynamic knobs that the PowerDial control system can manipulate to dynamically trade off the accuracy of the computation in return for reductions in the computational resources that the application requires to produce its results. These reductions translate directly into performance improvements and power savings. Our experimental results show that PowerDial can enable our benchmark applications to execute responsively in the face of power caps that would otherwise significantly impair responsiveness. They also show that PowerDial can significantly reduce the number of machines required to service intermittent load spikes, enabling reductions in power and capital costs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Liu:2012:FSD, author = "Song Liu and Karthik Pattabiraman and Thomas Moscibroda and Benjamin G. 
Zorn", title = "{Flikker}: saving {DRAM} refresh-power through critical data partitioning", journal = j-SIGPLAN, volume = "47", number = "4", pages = "213--224", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950391", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy has become a first-class design constraint in computer systems. Memory is a significant contributor to total system power. This paper introduces Flikker, an application-level technique to reduce refresh power in DRAM memories. Flikker enables developers to specify critical and non-critical data in programs and the runtime system allocates this data in separate parts of memory. The portion of memory containing critical data is refreshed at the regular refresh-rate, while the portion containing non-critical data is refreshed at substantially lower rates. This partitioning saves energy at the cost of a modest increase in data corruption in the non-critical data. Flikker thus exposes and leverages an interesting trade-off between energy consumption and hardware correctness. We show that many applications are naturally tolerant to errors in the non-critical data, and in the vast majority of cases, the errors have little or no impact on the application's final outcome. We also find that Flikker can save between 20-25\% of the power consumed by the memory sub-system in a mobile device, with negligible impact on application performance. Flikker is implemented almost entirely in software, and requires only modest changes to the hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Deng:2012:MAL, author = "Qingyuan Deng and David Meisner and Luiz Ramos and Thomas F. Wenisch and Ricardo Bianchini", title = "{MemScale}: active low-power modes for main memory", journal = j-SIGPLAN, volume = "47", number = "4", pages = "225--238", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950392", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Main memory is responsible for a large and increasing fraction of the energy consumed by servers. Prior work has focused on exploiting DRAM low-power states to conserve energy. However, these states require entire DRAM ranks to be idled, which is difficult to achieve even in lightly loaded servers. In this paper, we propose to conserve memory energy while improving its energy-proportionality by creating active low-power modes for it. Specifically, we propose MemScale, a scheme wherein we apply dynamic voltage and frequency scaling (DVFS) to the memory controller and dynamic frequency scaling (DFS) to the memory channels and DRAM devices. MemScale is guided by an operating system policy that determines the DVFS/DFS mode of the memory subsystem based on the current need for memory bandwidth, the potential energy savings, and the performance degradation that applications are willing to withstand. Our results demonstrate that MemScale reduces energy consumption significantly compared to modern memory energy management approaches. 
We conclude that the potential benefits of the MemScale mechanisms and policy more than compensate for their small hardware cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Gao:2012:TMH, author = "Qi Gao and Wenbin Zhang and Zhezhe Chen and Mai Zheng and Feng Qin", title = "{2ndStrike}: toward manifesting hidden concurrency typestate bugs", journal = j-SIGPLAN, volume = "47", number = "4", pages = "239--250", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950394", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrency bugs are becoming increasingly prevalent in the multi-core era. Recently, much research has focused on data races and atomicity violation bugs, which are related to low-level memory accesses. However, a large number of concurrency typestate bugs such as ``invalid reads to a closed file from a different thread'' are under-studied. These concurrency typestate bugs are important yet challenging to study since they are mostly relevant to high-level program semantics. This paper presents 2ndStrike, a method to manifest hidden concurrency typestate bugs in software testing. Given a state machine describing correct program behavior on certain object typestates, 2ndStrike profiles runtime events related to the typestates and thread synchronization. Based on the profiling results, 2ndStrike then identifies bug candidates, each of which is a pair of runtime events that would cause typestate violation if the event order is reversed. Finally, 2ndStrike re-executes the program with controlled thread interleaving to manifest bug candidates. We have implemented a prototype of 2ndStrike on Linux and have illustrated our idea using three types of concurrency typestate bugs, including invalid file operation, invalid pointer dereference, and invalid lock operation. We have evaluated 2ndStrike with six real world bugs (including one previously unknown bug) from three open-source server and desktop programs (i.e., MySQL, Mozilla, pbzip2). Our experimental results show that 2ndStrike can effectively and efficiently manifest all six software bugs, most of which are difficult or impossible to manifest using stress testing or active testing techniques that are based on data race/atomicity violation. 
Additionally, 2ndStrike reports no false positives, provides detailed bug reports for each manifested bug, and can consistently reproduce the bug after manifesting it once.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Zhang:2012:CDC, author = "Wei Zhang and Junghee Lim and Ramya Olichandran and Joel Scherpelz and Guoliang Jin and Shan Lu and Thomas Reps", title = "{ConSeq}: detecting concurrency bugs through sequential errors", journal = j-SIGPLAN, volume = "47", number = "4", pages = "251--264", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950395", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrency bugs are caused by non-deterministic interleavings between shared memory accesses. Their effects propagate through data and control dependences until they cause software to crash, hang, produce incorrect output, etc. The lifecycle of a bug thus consists of three phases: (1) triggering, (2) propagation, and (3) failure. Traditional techniques for detecting concurrency bugs mostly focus on phase (1)--i.e., on finding certain structural patterns of interleavings that are common triggers of concurrency bugs, such as data races. This paper explores a consequence-oriented approach to improving the accuracy and coverage of state-space search and bug detection. The proposed approach first statically identifies potential failure sites in a program binary (i.e., it first considers a phase (3) issue). It then uses static slicing to identify critical read instructions that are highly likely to affect potential failure sites through control and data dependences (phase (2)). Finally, it monitors a single (correct) execution of a concurrent program and identifies suspicious interleavings that could cause an incorrect state to arise at a critical read and then lead to a software failure (phase (1)). ConSeq's backwards approach, (3) $\rightarrow$ (2) $\rightarrow$ (1), provides advantages in bug-detection coverage and accuracy but is challenging to carry out. ConSeq makes it feasible by exploiting the empirical observation that phases (2) and (3) usually are short and occur within one thread. Our evaluation on large, real-world C/C++ applications shows that ConSeq detects more bugs than traditional approaches and has a much lower false-positive rate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Chipounov:2012:SPV, author = "Vitaly Chipounov and Volodymyr Kuznetsov and George Candea", title = "{S2E}: a platform for in-vivo multi-path analysis of software systems", journal = j-SIGPLAN, volume = "47", number = "4", pages = "265--278", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950396", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents S2E, a platform for analyzing the properties and behavior of software systems.
We demonstrate S2E's use in developing practical tools for comprehensive performance profiling, reverse engineering of proprietary software, and bug finding for both kernel-mode and user-mode binaries. Building these tools on top of S2E took less than 770 LOC and 40 person-hours each. S2E's novelty consists of its ability to scale to large real systems, such as a full Windows stack. S2E is based on two new ideas: selective symbolic execution, a way to automatically minimize the amount of code that has to be executed symbolically given a target analysis, and relaxed execution consistency models, a way to make principled performance/accuracy trade-offs in complex analyses. These techniques give S2E three key abilities: to simultaneously analyze entire families of execution paths, instead of just one execution at a time; to perform the analyses in-vivo within a real software stack--user programs, libraries, kernel, drivers, etc.--instead of using abstract models of these layers; and to operate directly on binaries, thus being able to analyze even proprietary software. Conceptually, S2E is an automated path explorer with modular path analyzers: the explorer drives the target system down all execution paths of interest, while analyzers check properties of each such path (e.g., to look for bugs) or simply collect information (e.g., count page faults). Desired paths can be specified in multiple ways, and S2E users can either combine existing analyzers to build a custom analysis tool, or write new analyzers using the S2E API.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Hofmann:2012:EOS, author = "Owen S. Hofmann and Alan M. Dunn and Sangman Kim and Indrajit Roy and Emmett Witchel", title = "Ensuring operating system kernel integrity with {OSck}", journal = j-SIGPLAN, volume = "47", number = "4", pages = "279--290", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950398", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Kernel rootkits that modify operating system state to avoid detection are a dangerous threat to system security. This paper presents OSck, a system that discovers kernel rootkits by detecting malicious modifications to operating system data. OSck integrates and extends existing techniques for detecting rootkits, and verifies safety properties for large portions of the kernel heap with minimal overhead. We deduce type information for verification by analyzing unmodified kernel source code and in-memory kernel data structures. High-performance integrity checks that execute concurrently with a running operating system create data races, and we demonstrate a deterministic solution for ensuring kernel memory is in a consistent state. We introduce two new classes of kernel rootkits that are undetectable by current systems, motivating the need for the OSck API that allows kernel developers to conveniently specify arbitrary integrity properties.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Porter:2012:RLT, author = "Donald E. Porter and Silas Boyd-Wickizer and Jon Howell and Reuben Olinsky and Galen C. 
Hunt", title = "Rethinking the library {OS} from the top down", journal = j-SIGPLAN, volume = "47", number = "4", pages = "291--304", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950399", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper revisits an old approach to operating system construction, the library OS, in a new context. The idea of the library OS is that the personality of the OS on which an application depends runs in the address space of the application. A small, fixed set of abstractions connects the library OS to the host OS kernel, offering the promise of better system security and more rapid independent evolution of OS components. We describe a working prototype of a Windows 7 library OS that runs the latest releases of major applications such as Microsoft Excel, PowerPoint, and Internet Explorer. We demonstrate that desktop sharing across independent, securely isolated, library OS instances can be achieved through the pragmatic reuse of net-working protocols. Each instance has significantly lower overhead than a full VM bundled with an application: a typical application adds just 16MB of working set and 64MB of disk footprint. We contribute a new ABI below the library OS that enables application mobility. We also show that our library OS can address many of the current uses of hardware virtual machines at a fraction of the overheads. This paper describes the first working prototype of a full commercial OS redesigned as a library OS capable of running significant applications. Our experience shows that the long-promised benefits of the library OS approach better protection of system integrity and rapid system evolution are readily obtainable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Palix:2012:FLT, author = "Nicolas Palix and Ga{\"e}l Thomas and Suman Saha and Christophe Calv{\`e}s and Julia Lawall and Gilles Muller", title = "Faults in {Linux}: ten years later", journal = j-SIGPLAN, volume = "47", number = "4", pages = "305--318", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950401", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In 2001, Chou et al. published a study of faults found by applying a static analyzer to Linux versions 1.0 through 2.4.1. A major result of their work was that the drivers directory contained up to 7 times more of certain kinds of faults than other directories. This result inspired a number of development and research efforts on improving the reliability of driver code. Today Linux is used in a much wider range of environments, provides a much wider range of services, and has adopted a new development and release model. What has been the impact of these changes on code quality? Are drivers still a major problem? To answer these questions, we have transported the experiments of Chou et al. to Linux versions 2.6.0 to 2.6.33, released between late 2003 and early 2010. 
We find that Linux has more than doubled in size during this period, but that the number of faults per line of code has been decreasing. And, even though drivers still accounts for a large part of the kernel code and contains the most faults, its fault rate is now below that of other directories, such as arch (HAL) and fs (file systems). These results can guide further development and research efforts. To enable others to continually update these results as Linux evolves, we define our experimental protocol and make our checkers and results available in a public archive.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Esmaeilzadeh:2012:LBL, author = "Hadi Esmaeilzadeh and Ting Cao and Yang Xi and Stephen M. Blackburn and Kathryn S. McKinley", title = "Looking back on the language and hardware revolutions: measured power, performance, and scaling", journal = j-SIGPLAN, volume = "47", number = "4", pages = "319--332", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950402", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper reports and analyzes measured chip power and performance on five process technology generations executing 61 diverse benchmarks with a rigorous methodology. We measure representative Intel IA32 processors with technologies ranging from 130nm to 32nm while they execute sequential and parallel benchmarks written in native and managed languages. During this period, hardware and software changed substantially: (1) hardware vendors delivered chip multiprocessors instead of uniprocessors, and independently (2) software developers increasingly chose managed languages instead of native languages. This quantitative data reveals the extent of some known and previously unobserved hardware and software trends. Two themes emerge. (I) Workload: The power, performance, and energy trends of native workloads do not approximate managed workloads. For example, (a) the SPEC CPU2006 native benchmarks on the i7 (45) and i5 (32) draw significantly less power than managed or scalable native benchmarks; and (b) managed runtimes exploit parallelism even when running single-threaded applications. The results recommend architects always include native and managed workloads when designing and evaluating energy efficient hardware. (II) Architecture: Clock scaling, microarchitecture, simultaneous multithreading, and chip multiprocessors each elicit a huge variety of power, performance, and energy responses. This variety and the difficulty of obtaining power measurements recommends exposing on-chip power meters and when possible structure specific power meters for cores, caches, and other structures. 
Just as hardware event counters provide a quantitative grounding for performance innovations, power meters are necessary for optimizing energy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Nguyen:2012:SCS, author = "Donald Nguyen and Keshav Pingali", title = "Synthesizing concurrent schedulers for irregular algorithms", journal = j-SIGPLAN, volume = "47", number = "4", pages = "333--344", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950404", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scheduling is the assignment of tasks or activities to processors for execution, and it is an important concern in parallel programming. Most prior work on scheduling has focused either on static scheduling of applications in which the dependence graph is known at compile-time or on dynamic scheduling of independent loop iterations such as in OpenMP. In irregular algorithms, dependences between activities are complex functions of runtime values so these algorithms are not amenable to compile-time analysis nor do they consist of independent activities. Moreover, the amount of work can vary dramatically with the scheduling policy. To handle these complexities, implementations of irregular algorithms employ carefully handcrafted, algorithm-specific schedulers but these schedulers are themselves parallel programs, complicating the parallel programming problem further. In this paper, we present a flexible and efficient approach for specifying and synthesizing scheduling policies for irregular algorithms. We develop a simple compositional specification language and show how it can concisely encode scheduling policies in the literature. Then, we show how to synthesize efficient parallel schedulers from these specifications. We evaluate our approach for five irregular algorithms on three multicore architectures and show that (1) the performance of some algorithms can improve by orders of magnitude with the right scheduling policy, and (2) for the same policy, the overheads of our synthesized schedulers are comparable to those of fixed-function schedulers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Hoang:2012:ECT, author = "Giang Hoang and Robby Bruce Findler and Russ Joseph", title = "Exploring circuit timing-aware language and compilation", journal = j-SIGPLAN, volume = "47", number = "4", pages = "345--356", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950405", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "By adjusting the design of the ISA and enabling circuit timing-sensitive optimizations in a compiler, we can more effectively exploit timing speculation. While there has been growing interest in systems that leverage circuit-level timing speculation to improve the performance and power-efficiency of processors, most of the innovation has been at the microarchitectural level. 
We make the observation that some code sequences place greater demand on circuit timing deadlines than others. Furthermore, by selectively replacing these codes with instruction sequences which are semantically equivalent but reduce activity on timing critical circuit paths, we can trigger fewer timing errors and hence reduce recovery costs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Farhad:2012:OAM, author = "Sardar M. Farhad and Yousun Ko and Bernd Burgstaller and Bernhard Scholz", title = "Orchestration by approximation: mapping stream programs onto multicore architectures", journal = j-SIGPLAN, volume = "47", number = "4", pages = "357--368", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950406", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a novel 2-approximation algorithm for deploying stream graphs on multicore computers and a stream graph transformation that eliminates bottlenecks. The key technical insight is a data rate transfer model that enables the computation of a ``closed form'', i.e., the data rate transfer function of an actor depending on the arrival rate of the stream program. A combinatorial optimization problem uses the closed form to maximize the throughput of the stream program. Although the problem is inherently NP-hard, we present an efficient and effective 2-approximation algorithm that provides a lower bound on the quality of the solution. We introduce a transformation that uses the closed form to identify and eliminate bottlenecks. We show experimentally that state-of-the art integer linear programming approaches for orchestrating stream graphs are (1) intractable or at least impractical for larger stream graphs and larger number of processors and (2) our 2-approximation algorithm is highly efficient and its results are close to the optimal solution for a standard set of StreamIt benchmark programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Zhang:2012:FED, author = "Eddy Z. Zhang and Yunlian Jiang and Ziyu Guo and Kai Tian and Xipeng Shen", title = "On-the-fly elimination of dynamic irregularities for {GPU} computing", journal = j-SIGPLAN, volume = "47", number = "4", pages = "369--380", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950408", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The power-efficient massively parallel Graphics Processing Units (GPUs) have become increasingly influential for general-purpose computing over the past few years. However, their efficiency is sensitive to dynamic irregular memory references and control flows in an application. Experiments have shown great performance gains when these irregularities are removed. But it remains an open question how to achieve those gains through software approaches on modern GPUs. 
This paper presents a systematic exploration to tackle dynamic irregularities in both control flows and memory references. It reveals some properties of dynamic irregularities in both control flows and memory references, their interactions, and their relations with program data and threads. It describes several heuristics-based algorithms and runtime adaptation techniques for effectively removing dynamic irregularities through data reordering and job swapping. It presents a framework, G-Streamline, as a unified software solution to dynamic irregularities in GPU computing. G-Streamline has several distinctive properties. It is a pure software solution and works on the fly, requiring no hardware extensions or offline profiling. It treats both types of irregularities at the same time in a holistic fashion, maximizing the whole-program performance by resolving conflicts among optimizations. Its optimization overhead is largely transparent to GPU kernel executions, jeopardizing no basic efficiency of the GPU application. Finally, it is robust to the presence of various complexities in GPU applications. Experiments show that G-Streamline is effective in reducing dynamic irregularities in GPU computing, producing speedups between 1.07 and 2.5 for a variety of applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Hormati:2012:SPS, author = "Amir H. Hormati and Mehrzad Samadi and Mark Woh and Trevor Mudge and Scott Mahlke", title = "{Sponge}: portable stream programming on graphics engines", journal = j-SIGPLAN, volume = "47", number = "4", pages = "381--392", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950409", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graphics processing units (GPUs) provide a low cost platform for accelerating high performance computations. The introduction of new programming languages, such as CUDA and OpenCL, makes GPU programming attractive to a wide variety of programmers. However, programming GPUs is still a cumbersome task for two primary reasons: tedious performance optimizations and lack of portability. First, optimizing an algorithm for a specific GPU is a time-consuming task that requires a thorough understanding of both the algorithm and the underlying hardware. Unoptimized CUDA programs typically only achieve a small fraction of the peak GPU performance. Second, GPU code lacks efficient portability as code written for one GPU can be inefficient when executed on another. Moving code from one GPU to another while maintaining the desired performance is a non-trivial task often requiring significant modifications to account for the hardware differences. In this work, we propose Sponge, a compilation framework for GPUs using synchronous data flow streaming languages. Sponge is capable of performing a wide variety of optimizations to generate efficient code for graphics engines. Sponge alleviates the problems associated with current GPU programming methods by providing portability across different generations of GPUs and CPUs, and a better abstraction of the hardware details, such as the memory hierarchy and threading model. 
Using streaming, we provide a write-once software paradigm and rely on the compiler to automatically create optimized CUDA code for a wide variety of GPU targets. Sponge's compiler optimizations improve the performance of the baseline CUDA implementations by an average of 3.2x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Kamruzzaman:2012:ICP, author = "Md Kamruzzaman and Steven Swanson and Dean M. Tullsen", title = "Inter-core prefetching for multicore processors using migrating helper threads", journal = j-SIGPLAN, volume = "47", number = "4", pages = "393--404", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950411", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multicore processors have become ubiquitous in today's systems, but exploiting the parallelism they offer remains difficult, especially for legacy applications and applications with large serial components. The challenge, then, is to develop techniques that allow multiple cores to work in concert to accelerate a single thread. This paper describes inter-core prefetching, a technique to exploit multiple cores to accelerate a single thread. Inter-core prefetching extends existing work on helper threads for SMT machines to multicore machines. Inter-core prefetching uses one compute thread and one or more prefetching threads. The prefetching threads execute on cores that would otherwise be idle, prefetching the data that the compute thread will need. The compute thread then migrates between cores, following the path of the prefetch threads, and finds the data already waiting for it. Inter-core prefetching works with existing hardware and existing instruction set architectures. Using a range of state-of-the-art multiprocessors, this paper characterizes the potential benefits of the technique with microbenchmarks and then measures its impact on a range of memory intensive applications. The results show that inter-core prefetching improves performance by an average of 31 to 63\%, depending on the architecture, and speeds up some applications by as much as 2.8$ \times $. It also demonstrates that inter-core prefetching reduces energy consumption by between 11 and 26\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Hayashizaki:2012:IPT, author = "Hiroshige Hayashizaki and Peng Wu and Hiroshi Inoue and Mauricio J. Serrano and Toshio Nakatani", title = "Improving the performance of trace-based systems by false loop filtering", journal = j-SIGPLAN, volume = "47", number = "4", pages = "405--418", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950412", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Trace-based compilation is a promising technique for language compilers and binary translators. It offers the potential to expand the compilation scopes that have traditionally been limited by method boundaries.
Detecting repeating cyclic execution paths and capturing the detected repetitions into traces is a key requirement for trace selection algorithms to achieve good optimization and performance with small amounts of code. One important class of repetition detection is cyclic-path-based repetition detection, where a cyclic execution path (a path that starts and ends at the same instruction address) is detected as a repeating cyclic execution path. However, we found many cyclic paths that are not repeating cyclic execution paths, which we call false loops. A common class of false loops occurs when a method is invoked from multiple call-sites. A cycle is formed between two invocations of the method from different call-sites, but which does not represent loops or recursion. False loops can result in shorter traces and smaller compilation scopes, and degrade the performance. We propose false loop filtering, an approach to reject false loops in the repetition detection step of trace selection, and a technique called false loop filtering by call-stack-comparison, which rejects a cyclic path as a false loop if the call stacks at the beginning and the end of the cycle are different. We applied false loop filtering to our trace-based Java\TM{} JIT compiler that is based on IBM's J9 JVM. We found that false loop filtering achieved an average improvement of 16\% and 10\% for the DaCapo benchmark when applied to two baseline trace selection algorithms, respectively, with up to 37\% improvement for individual benchmarks. In the end, with false loop filtering, our trace-based JIT achieves a performance comparable to that of the method-based J9 JVM/JIT using the corresponding optimization level.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '12 conference proceedings.", } @Article{Xue:2012:RJC, author = "Jingling Xue", title = "Rethinking {Java} call stack design for tiny embedded devices", journal = j-SIGPLAN, volume = "47", number = "5", pages = "1--10", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248420", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "The ability of tiny embedded devices to run large feature-rich programs is typically constrained by the amount of memory installed on such devices. Furthermore, the useful operation of these devices in wireless sensor applications is limited by their battery life. This paper presents a call stack redesign targeted at an efficient use of RAM storage and CPU cycles by a Java program running on a wireless sensor mote. Without compromising the application programs, our call stack redesign saves 30\% of RAM, on average, evaluated over a large number of benchmarks. On the same set of benchmarks, our design also avoids frequent RAM allocations and deallocations, resulting in average 80\% fewer memory operations and 23\% faster program execution. These may be critical improvements for tiny embedded devices that are equipped with small amount of RAM and limited battery life. However, our call stack redesign is equally effective for any complex multi-threaded object oriented program developed for desktop computers.
We describe the redesign, measure its performance and report the resulting savings in RAM and execution time for a wide variety of programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sallenave:2012:LGE, author = "Olivier Sallenave and Roland Ducournau", title = "Lightweight generics in embedded systems through static analysis", journal = j-SIGPLAN, volume = "47", number = "5", pages = "11--20", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248421", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "Low-end embedded systems are still programmed in C and assembly, and adopting high-level languages such as C\# should reduce the length of their development cycles. For these systems, code size is a major concern, but run-time efficiency should also be reasonable --- programmers will not migrate to C\# unless the overhead compared with C is insignificant. In this paper, we propose a static approach based on whole program optimization for implementing {.NET} generics in such systems. Indeed, the implementation of run-time generics involves a tradeoff between size and run-time efficiency. In this proposal, generic instances are detected through a generalization of RTA to parametric polymorphism. Also, we propose an implementation scheme which employs code sharing and more effective coercions than boxing. Unlike existing implementation schemes, it is scalable in the number of generic instances without involving boxing and unboxing in a systematic way.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kyle:2012:EPI, author = "Stephen Kyle and Igor B{\"o}hm and Bj{\"o}rn Franke and Hugh Leather and Nigel Topham", title = "Efficiently parallelizing instruction set simulation of embedded multi-core processors using region-based just-in-time dynamic binary translation", journal = j-SIGPLAN, volume = "47", number = "5", pages = "21--30", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248422", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "Embedded systems, as typified by modern mobile phones, are already seeing a drive toward using multi-core processors. The number of cores will likely increase rapidly in the future. Engineers and researchers need to be able to simulate systems, as they are expected to be in a few generations time, running simulations of many-core devices on today's multi-core machines. These requirements place heavy demands on the scalability of simulation engines, the fastest of which have typically evolved from just-in-time (Jit) dynamic binary translators (Dbt). Existing work aimed at parallelizing Dbt simulators has focused exclusively on trace-based Dbt, wherein linear execution traces or perhaps trees thereof are the units of translation. Region-based Dbt simulators have not received the same attention and require different techniques than their trace-based cousins. 
In this paper we develop an innovative approach to scaling multi-core, embedded simulation through region-based Dbt. We initially modify the Jit code generator of such a simulator to emit code that does not depend on a particular thread with its thread-specific context and is, therefore, thread-agnostic. We then demonstrate that this thread-agnostic code generation is comparable to thread-specific code with respect to performance, but also enables the sharing of JIT-compiled regions between different threads. This sharing optimisation, in turn, leads to significant performance improvements for multi-threaded applications. In fact, our results confirm that an average of 76\% of all JIT-compiled regions can be shared between 128 threads in representative, parallel workloads. We demonstrate that this translates into an overall performance improvement by 1.44x on average and up to 2.40x across 12 multi-threaded benchmarks taken from the Splash-2 benchmark suite, targeting our high-performance multi-core Dbt simulator for embedded Arc processors running on a 4-core Intel host machine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Huang:2012:WAR, author = "Yazhi Huang and Mengying Zhao and Chun Jason Xue", title = "{WCET}-aware re-scheduling register allocation for real-time embedded systems with clustered {VLIW} architecture", journal = j-SIGPLAN, volume = "47", number = "5", pages = "31--40", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248424", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "Worst-Case Execution Time (WCET) is one of the most important metrics in real-time embedded system design. For embedded systems with clustered VLIW architecture, register allocation, instruction scheduling, and cluster assignment are three key activities to pursue code optimization which have profound impact on WCET. At the same time, these three activities exhibit a phase ordering problem: Independently performing register allocation, scheduling and cluster assignment could have a negative effect on the other phases, thereby generating sub-optimal compiled codes. In this paper, a compiler level optimization, namely WCET-aware Re-scheduling Register Allocation (WRRA), is proposed to achieve WCET minimization for real-time embedded systems with clustered VLIW architecture. The novelty of the proposed approach is that the effects of register allocation, instruction scheduling and cluster assignment on the quality of generated code are taken into account for WCET minimization. These three compilation processes are integrated into a single phase to obtain a balanced result. The proposed technique is implemented in Trimaran 4.0. 
The experimental results show that the proposed technique can reduce WCET effectively, by 33\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wan:2012:WAD, author = "Qing Wan and Hui Wu and Jingling Xue", title = "{WCET}-aware data selection and allocation for scratchpad memory", journal = j-SIGPLAN, volume = "47", number = "5", pages = "41--50", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248425", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "In embedded systems, SPM (scratchpad memory) is an attractive alternative to cache memory due to its lower energy consumption and higher predictability of program execution. This paper studies the problem of placing variables of a program into an SPM such that its WCET (worst-case execution time) is minimized. We propose an efficient dynamic approach that comprises two novel heuristics. The first heuristic iteratively selects a most beneficial variable as an SPM resident candidate based on its impact on the k longest paths of the program. The second heuristic incrementally allocates each SPM resident candidate to the SPM based on graph coloring and acyclic graph orientation. We have evaluated our approach by comparing with an ILP-based approach and a longest-path-based greedy approach using the eight benchmarks selected from Powerstone and M{\"a}lardalen WCET Benchmark suites under three different SPM configurations. Our approach achieves up to 21\% and 43\% improvements in WCET reduction over the ILP-based approach and the greedy approach, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gerard:2012:MMO, author = "L{\'e}onard G{\'e}rard and Adrien Guatto and C{\'e}dric Pasteur and Marc Pouzet", title = "A modular memory optimization for synchronous data-flow languages: application to arrays in a {Lustre} compiler", journal = j-SIGPLAN, volume = "47", number = "5", pages = "51--60", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248426", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "The generation of efficient sequential code for synchronous data-flow languages raises two intertwined issues: control and memory optimization. While the former has been extensively studied, for instance in the compilation of Lustre and Signal, the latter has only been addressed in a restricted manner. Yet, memory optimization becomes a pressing issue when arrays are added to such languages. This article presents a two-level solution to the memory optimization problem. It combines a compile-time optimization algorithm, reminiscent of register allocation, paired with language annotations on the source given by the designer. Annotations express in-place modifications and control where allocation is performed. Moreover, they allow external functions performing in-place modifications to be safely imported. 
Soundness of annotations is guaranteed by a semilinear type system and additional scheduling constraints. A key feature is that annotations for well-typed programs do not change the semantics of the language: removing them may lead to less efficient code but will not alter the semantics. The method has been implemented in a new compiler for a LUSTRE-like synchronous language extended with hierarchical automata and arrays. Experiments show that the proposed approach removes most of the unnecessary array copies, resulting in faster code that uses less memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sbirlea:2012:MDF, author = "Alina Sb{\^\i}rlea and Yi Zou and Zoran Budiml{\'\i}c and Jason Cong and Vivek Sarkar", title = "Mapping a data-flow programming model onto heterogeneous platforms", journal = j-SIGPLAN, volume = "47", number = "5", pages = "61--70", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248428", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "In this paper we explore mapping of a high-level macro data-flow programming model called Concurrent Collections (CnC) onto heterogeneous platforms in order to achieve high performance and low energy consumption while preserving the ease of use of data-flow programming. Modern computing platforms are becoming increasingly heterogeneous in order to improve energy efficiency. This trend is clearly seen across a diverse spectrum of platforms, from small-scale embedded SOCs to large-scale super-computers. However, programming these heterogeneous platforms poses a serious challenge for application developers. We have designed a software flow for converting high-level CnC programs to the Habanero-C language. CnC programs have a clear separation between the application description, the implementation of each of the application components and the abstraction of hardware platform, making it an excellent programming model for domain experts. Domain experts can later employ the help of a tuning expert (either a compiler or a person) to tune their applications with minimal effort. We also extend the Habanero-C runtime system to support work-stealing across heterogeneous computing devices and introduce task affinity for these heterogeneous components to allow users to fine tune the runtime scheduling decisions. We demonstrate a working example that maps a pipeline of medical image-processing algorithms onto a prototype heterogeneous platform that includes CPUs, GPUs and FPGAs. For the medical imaging domain, where obtaining fast and accurate results is a critical step in diagnosis and treatment of patients, we show that our model offers up to 17.72X speedup and an estimated usage of 0.52X of the power used by CPUs alone, when using accelerators (GPUs and FPGAs) and CPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hashemi:2012:FSU, author = "Matin Hashemi and Mohammad H. 
Foroozannejad and Soheil Ghiasi and Christoph Etzel", title = "{FORMLESS}: scalable utilization of embedded manycores in streaming applications", journal = j-SIGPLAN, volume = "47", number = "5", pages = "71--78", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248429", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "Variants of dataflow specification models are widely used to synthesize streaming applications for distributed-memory parallel processors. We argue that current practice of specifying streaming applications using rigid dataflow models implicitly prohibits a number of platform oriented optimizations and hence limits portability and scalability with respect to number of processors. We motivate Functionally-cOnsistent stRucturally-MalLEable Streaming Specification, dubbed FORMLESS, which refers to raising the abstraction level beyond fixed-structure dataflow to address its portability and scalability limitations. To demonstrate the potential of the idea, we develop a design space exploration scheme to customize the application specification to better fit the target platform. Experiments with several common streaming case studies demonstrate improved portability and scalability over conventional dataflow specification models, and confirm the effectiveness of our approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Farhad:2012:PGD, author = "S. M. Farhad and Yousun Ko and Bernd Burgstaller and Bernhard Scholz", title = "Profile-guided deployment of stream programs on multicores", journal = j-SIGPLAN, volume = "47", number = "5", pages = "79--88", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248430", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "Because multicore architectures have become the industry standard, programming abstractions for concurrent programming are of key importance. Stream programming languages facilitate application domains characterized by regular sequences of data, such as multimedia, graphics, signal processing and networking. With stream programs, computations are expressed through independent actors that interact through FIFO data channels. A major challenge with stream programs is to load-balance actors among available processing cores. The workload of a stream program is determined by actor execution times and the communication overhead induced by data channels. Estimating communication costs on cache-coherent shared-memory multiprocessors is difficult, because data movements are abstracted away by the cache coherence protocol. Standard execution time profiling techniques cannot separate actor execution times from communication costs, because communication costs manifest in terms of execution time overhead. In this work we present a unified Integer Linear Programming (ILP) formulation that balances the workload of stream programs on cache-coherent multicore architectures.
For estimating the communication costs of data channels, we devise a novel profiling scheme that minimizes the number of profiling steps. We conduct experiments across a range of StreamIt benchmarks and show that our method achieves a speedup of up to 4.02x on 6 processors. The number of profiling steps is on average only 17\% of an exhaustive profiling run over all data channels of a stream program.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fang:2012:IDP, author = "Zhenman Fang and Jiaxin Li and Weihua Zhang and Yi Li and Haibo Chen and Binyu Zang", title = "Improving dynamic prediction accuracy through multi-level phase analysis", journal = j-SIGPLAN, volume = "47", number = "5", pages = "89--98", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248432", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "Phase analysis, which classifies the set of execution intervals with similar execution behavior and resource requirements, has been widely used in a variety of dynamic systems, including dynamic cache reconfiguration, prefetching and race detection. While phase granularity has been a major factor to the accuracy of phase prediction, it has not been well investigated yet and most dynamic systems usually adopt a fine-grained prediction scheme. However, such a scheme can only take account of recent local phase information and could be frequently interfered by temporary noises due to instant phase changes, which might notably limit the prediction accuracy. In this paper, we make the first investigation on the potential of multi-level phase analysis (MLPA), where different granularity phase analysis are combined together to improve the overall accuracy. The key observation is that a coarse-grained interval, which usually consists of stably-distributed fine-grained intervals, can be accurately identified based on the fine-grained intervals at the beginning of its execution. Based on the observation, we design and implement a MLPA scheme. In such a scheme, a coarse-grained phase is first identified based on the fine-grained intervals at the beginning of its execution. The following fine-grained phases in it are then predicted based on the sequence of fine-grained phases in the coarse-grained phase. Experimental results show such a scheme can notably improve the prediction accuracy. Using Markov fine-grained phase predictor as the baseline, MLPA can improve prediction accuracy by 20\%, 39\% and 29\% for next phase, phase change and phase length prediction for SPEC2000 accordingly, yet incur only about 2\% time overhead and 40\% space overhead (about 360 bytes in total). To demonstrate the effectiveness of MLPA, we apply it to a dynamic cache reconfiguration system which dynamically adjusts the cache size to reduce the power consumption and access time of data cache. 
Experimental results show that MLPA can further reduce the average cache size by 15\% compared to the fine-grained scheme.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Khudia:2012:ESE, author = "Daya Shanker Khudia and Griffin Wright and Scott Mahlke", title = "Efficient soft error protection for commodity embedded microprocessors using profile information", journal = j-SIGPLAN, volume = "47", number = "5", pages = "99--108", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248433", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "Successive generations of processors use smaller transistors in the quest to make more powerful computing systems. It has been previously studied that smaller transistors make processors more susceptible to soft errors (transient faults caused by high energy particle strikes). Such errors can result in unexpected behavior and incorrect results. With smaller and cheaper transistors becoming pervasive in mainstream computing, it is necessary to protect these devices against soft errors; an increasing rate of faults necessitates the protection of applications running on commodity processors against soft errors. The existing methods of protecting against such faults generally have high area or performance overheads and thus are not directly applicable in the embedded design space. In order to protect against soft errors, the detection of these errors is a necessary first step so that a recovery can be triggered. To solve the problem of detecting soft errors cheaply, we propose a profiling-based software-only application analysis and transformation solution. The goal is to develop a low cost solution which can be deployed for off-the-shelf embedded processors. The solution works by intelligently duplicating instructions that are likely to affect the program output, and comparing results between original and duplicated instructions. The intelligence of our solution is garnered through the use of control flow, memory dependence, and value profiling to understand and exploit the common-case behavior of applications. Our solution is able to achieve 92\% fault coverage with a 20\% instruction overhead. This represents a 41\% lower performance overhead than the best prior approaches with approximately the same fault coverage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2012:CAP, author = "Qingan Li and Mengying Zhao and Chun Jason Xue and Yanxiang He", title = "Compiler-assisted preferred caching for embedded systems with {STT--RAM} based hybrid cache", journal = j-SIGPLAN, volume = "47", number = "5", pages = "109--118", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248434", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "As technology scales down, energy consumption is becoming a big problem for traditional SRAM-based cache hierarchies. 
The emerging Spin-Torque Transfer RAM (STT-RAM) is a promising replacement for large on-chip cache due to its ultra low leakage power and high storage density. However, write operations on STT-RAM suffer from considerably higher energy consumption and longer latency than SRAM. Hybrid cache consisting of both SRAM and STT-RAM has been proposed recently for both performance and energy efficiency. Most management strategies for hybrid caches employ migration-based techniques to dynamically move write-intensive data from STT-RAM to SRAM. These techniques lead to extra overheads. In this paper, we propose a compiler-assisted approach, preferred caching, to significantly reduce the migration overhead by giving migration-intensive memory blocks the preference for the SRAM part of the hybrid cache. Furthermore, a data assignment technique is proposed to improve the efficiency of preferred caching. The reduction of migration overhead can in turn improve the performance and energy efficiency of STT-RAM based hybrid cache. The experimental results show that, with the proposed techniques, on average, the number of migrations is reduced by 21.3\%, the total latency is reduced by 8.0\% and the total dynamic energy is reduced by 10.8\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zuluaga:2012:SDS, author = "Marcela Zuluaga and Andreas Krause and Peter Milder and Markus P{\"u}schel", title = "``Smart'' design space sampling to predict {Pareto}-optimal solutions", journal = j-SIGPLAN, volume = "47", number = "5", pages = "119--128", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248436", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "Many high-level synthesis tools offer degrees of freedom in mapping high-level specifications to Register-Transfer Level descriptions. These choices do not affect the functional behavior but span a design space of different cost-performance tradeoffs. In this paper we present a novel machine learning-based approach that efficiently determines the Pareto-optimal designs while only sampling and synthesizing a fraction of the design space. The approach combines three key components: (1) A regression model based on Gaussian processes to predict area and throughput based on synthesis training data. (2) A ``smart'' sampling strategy, GP-PUCB, to iteratively refine the model by carefully selecting the next design to synthesize to maximize progress. (3) A stopping criterion based on assessing the accuracy of the model without access to complete synthesis data. We demonstrate the effectiveness of our approach using IP generators for discrete Fourier transforms and sorting networks. 
However, our algorithm is not specific to this application and can be applied to a wide range of Pareto front prediction problems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bouissou:2012:OSS, author = "Olivier Bouissou and Alexandre Chapoutot", title = "An operational semantics for {Simulink}'s simulation engine", journal = j-SIGPLAN, volume = "47", number = "5", pages = "129--138", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248437", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "The industrial tool Matlab/Simulink is widely used in the design of embedded systems. The main feature of this tool is its ability to model in a common formalism the software and its physical environment. This makes it very useful for validating the design of embedded software using numerical simulation. However, the formal verification of such models is still problematic as Simulink is a programming language for which no formal semantics exists. In this article, we present an operational semantics of a representative subset of Simulink which includes both continuous-time and discrete-time blocks. We believe that this work gives a better understanding of Simulink and it defines the foundations of a general framework to apply formal methods on Simulink's high level descriptions of embedded systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yu:2012:SCC, author = "Fang Yu and Shun-Ching Yang and Farn Wang and Guan-Cheng Chen and Che-Chang Chan", title = "Symbolic consistency checking of {OpenMP} parallel programs", journal = j-SIGPLAN, volume = "47", number = "5", pages = "139--148", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248438", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "We present a symbolic approach for checking consistency of OpenMP parallel programs. A parallel program is consistent if it yields the same result as its sequential version despite the execution order among threads. We find race conditions of an OpenMP parallel program, construct the formal model of its raced segments under relaxed memory models, and perform guided symbolic simulation to search consistency violations. The simulation terminates when (1) a witness has been found (the program is inconsistent), or (2) all reachable states have been explored (the program is consistent). We have developed the tool Pathg by incorporating Omega library to solve race constraints and Red symbolic simulator to perform guided search. 
We show that Pathg can prove consistency of programs, identify races that modern OpenMP checkers failed to report, and find inconsistency witnesses effectively against benchmarks from the OpenMP Source Code Repository and the NAS Parallel benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gal-On:2012:CPR, author = "Shay Gal-On and Markus Levy", title = "Creating portable, repeatable, realistic benchmarks for embedded systems and the challenges thereof", journal = j-SIGPLAN, volume = "47", number = "5", pages = "149--152", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248440", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "LCTES '12 proceedings.", abstract = "To appreciate the challenges of analysing embedded processor behaviour, step back in time to understand the evolution of embedded processors. Only a few decades ago, embedded processors were relatively simple devices (compared to today), represented by a host of 8- and 16-bit microcontrollers, and 32-bit microprocessors, with minimal integration. Today, these processors (even the so-called low-end microcontrollers) have evolved into highly-integrated SoCs with a wide variety of architectures capable of tackling both specific and general-purpose tasks. Associated with these transformations, the benchmarks used to quantify the capabilities have also grown in complexity and range. At the simplest level, benchmarks such as CoreMark analyse the fundamental processor cores. At the other end of the spectrum, system benchmarks, such as BrowsingBench, analyse the entire SoC as well as the system software stack and even the physical interfaces. This paper examines some of the challenges of applying such benchmarks, and explains the methodologies used at EEMBC to manage portability, repeatability, and realism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hosking:2012:CHL, author = "Tony Hosking", title = "Compiling a high-level language for {GPUs}: (via language support for architectures and compilers)", journal = j-SIGPLAN, volume = "47", number = "6", pages = "1--12", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254066", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Languages such as OpenCL and CUDA offer a standard interface for general-purpose programming of GPUs. However, with these languages, programmers must explicitly manage numerous low-level details involving communication and synchronization. This burden makes programming GPUs difficult and error-prone, rendering these powerful devices inaccessible to most programmers. We desire a higher-level programming model that makes GPUs more accessible while also effectively exploiting their computational power. This paper presents features of Lime, a new Java-compatible language targeting heterogeneous systems, that allow an optimizing compiler to generate high quality GPU code.
The key insight is that the language type system enforces isolation and immutability invariants that allow the compiler to optimize for a GPU without heroic compiler analysis. Our compiler attains GPU speedups between 75\% and 140\% of the performance of native OpenCL code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Samadi:2012:AIA, author = "Mehrzad Samadi and Amir Hormati and Mojtaba Mehrara and Janghaeng Lee and Scott Mahlke", title = "Adaptive input-aware compilation for graphics engines", journal = j-SIGPLAN, volume = "47", number = "6", pages = "13--22", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254067", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "While graphics processing units (GPUs) provide low-cost and efficient platforms for accelerating high performance computations, the tedious process of performance tuning required to optimize applications is an obstacle to wider adoption of GPUs. In addition to the programmability challenges posed by GPU's complex memory hierarchy and parallelism model, a well-known application design problem is target portability across different GPUs. However, even for a single GPU target, changing a program's input characteristics can make an already-optimized implementation of a program perform poorly. In this work, we propose Adaptic, an adaptive input-aware compilation system to tackle this important, yet overlooked, input portability problem. Using this system, programmers develop their applications in a high-level streaming language and let Adaptic undertake the difficult task of input portable optimizations and code generation. Several input-aware optimizations are introduced to make efficient use of the memory hierarchy and customize thread composition. At runtime, a properly optimized version of the application is executed based on the actual program input. We perform a head-to-head comparison between the Adaptic generated and hand-optimized CUDA programs. The results show that Adaptic is capable of generating codes that can perform on par with their hand-optimized counterparts over certain input ranges and outperform them when the input falls out of the hand-optimized programs' ``comfort zone''. Furthermore, we show that input-aware results are sustainable across different GPU targets making it possible to write and optimize applications once and run them anywhere.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bacon:2012:TTW, author = "David F. Bacon and Perry Cheng and Sunil Shukla", title = "And then there were none: a stall-free real-time garbage collector for reconfigurable hardware", journal = j-SIGPLAN, volume = "47", number = "6", pages = "23--34", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254068", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Programmers are turning to radical architectures such as reconfigurable hardware (FPGAs) to achieve performance. 
But such systems, programmed at a very low level in languages with impoverished abstractions, are orders of magnitude more complex to use than conventional CPUs. The continued exponential increase in transistors, combined with the desire to implement ever more sophisticated algorithms, makes it imperative that such systems be programmed at much higher levels of abstraction. One of the fundamental high-level language features is automatic memory management in the form of garbage collection. We present the first implementation of a complete garbage collector in hardware (as opposed to previous ``hardware-assist'' techniques), using an FPGA and its on-chip memory. Using a completely concurrent snapshot algorithm, it provides single-cycle access to the heap, and never stalls the mutator for even a single cycle, achieving a deterministic mutator utilization (MMU) of 100\%. We have synthesized the collector to hardware and show that it never consumes more than 1\% of the logic resources of a high-end FPGA. For comparison we also implemented explicit (malloc/free) memory management, and show that real-time collection is about 4\% to 17\% slower than malloc, with comparable energy consumption. Surprisingly, in hardware real-time collection is superior to stop-the-world collection on every performance axis, and even for stressful micro-benchmarks can achieve 100\% MMU with heaps as small as 1.01 to 1.4 times the absolute minimum.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Oliveira:2012:ICN, author = "Bruno C. d. S. Oliveira and Tom Schrijvers and Wontae Choi and Wonchan Lee and Kwangkeun Yi", title = "The implicit calculus: a new foundation for generic programming", journal = j-SIGPLAN, volume = "47", number = "6", pages = "35--44", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254070", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Generic programming (GP) is an increasingly important trend in programming languages. Well-known GP mechanisms, such as type classes and the C++0x concepts proposal, usually combine two features: (1) a special type of interfaces; and (2) implicit instantiation of implementations of those interfaces. Scala implicits are a GP language mechanism, inspired by type classes, that break with the tradition of coupling implicit instantiation with a special type of interface. Instead, implicits provide only implicit instantiation, which is generalized to work for any types. This turns out to be quite powerful and useful to address many limitations that show up in other GP mechanisms. This paper synthesizes the key ideas of implicits formally in a minimal and general core calculus called the implicit calculus $ (\lambda \implies) $, and it shows how to build source languages supporting implicit instantiation on top of it. A novelty of the calculus is its support for partial resolution and higher-order rules (a feature that has been proposed before, but was never formalized or implemented). 
Ultimately, the implicit calculus provides a formal model of implicits, which can be used by language designers to study and inform implementations of similar mechanisms in their own languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kawaguchi:2012:DPL, author = "Ming Kawaguchi and Patrick Rondon and Alexander Bakst and Ranjit Jhala", title = "Deterministic parallelism via liquid effects", journal = j-SIGPLAN, volume = "47", number = "6", pages = "45--54", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254071", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Shared memory multithreading is a popular approach to parallel programming, but also fiendishly hard to get right. We present Liquid Effects, a type-and-effect system based on refinement types which allows for fine-grained, low-level, shared memory multi-threading while statically guaranteeing that a program is deterministic. Liquid Effects records the effect of an expression as a formula in first-order logic, making our type-and-effect system highly expressive. Further, effects like Read and Write are recorded in Liquid Effects as ordinary uninterpreted predicates, leaving the effect system open to extension by the user. By building our system as an extension to an existing dependent refinement type system, our system gains precise value- and branch-sensitive reasoning about effects. Finally, our system exploits the Liquid Types refinement type inference technique to automatically infer refinement types and effects. We have implemented our type-and-effect checking techniques in CSOLVE, a refinement type inference system for C programs. We demonstrate how CSOLVE uses Liquid Effects to prove the determinism of a variety of benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Guerraoui:2012:SL, author = "Rachid Guerraoui and Viktor Kuncak and Giuliano Losa", title = "Speculative linearizability", journal = j-SIGPLAN, volume = "47", number = "6", pages = "55--66", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254072", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Linearizability is a key design methodology for reasoning about implementations of concurrent abstract data types in both shared memory and message passing systems. It provides the illusion that operations execute sequentially and fault-free, despite the asynchrony and faults inherent to a concurrent system, especially a distributed one. A key property of linearizability is inter-object composability: a system composed of linearizable objects is itself linearizable. However, devising linearizable objects is very difficult, requiring complex algorithms to work correctly under general circumstances, and often resulting in bad average-case behavior. Concurrent algorithm designers therefore resort to speculation: optimizing algorithms to handle common scenarios more efficiently.
The outcome is even more complex protocols, for which it is no longer tractable to prove their correctness. To simplify the design of efficient yet robust linearizable protocols, we propose a new notion: speculative linearizability. This property is as general as linearizability, yet it allows intra-object composability: the correctness of independent protocol phases implies the correctness of their composition. In particular, it allows the designer to focus solely on the proof of an optimization and derive the correctness of the overall protocol from the correctness of the existing, non-optimized one. Our notion of protocol phases allows processes to independently switch from one phase to another, without requiring them to reach agreement to determine the change of a phase. To illustrate the applicability of our methodology, we show how examples of speculative algorithms for shared memory and asynchronous message passing naturally fit into our framework. We rigorously define speculative linearizability and prove our intra-object composition theorem in a trace-based as well as an automaton-based model. To obtain a further degree of confidence, we also formalize and mechanically check the theorem in the automaton-based model, using the I/O automata framework within the Isabelle interactive proof assistant. We expect our framework to enable, for the first time, scalable specifications and mechanical proofs of speculative implementations of linearizable objects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zaparanuks:2012:AP, author = "Dmitrijs Zaparanuks and Matthias Hauswirth", title = "Algorithmic profiling", journal = j-SIGPLAN, volume = "47", number = "6", pages = "67--76", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254074", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Traditional profilers identify where a program spends most of its resources. They do not provide information about why the program spends those resources or about how resource consumption would change for different program inputs. In this paper we introduce the idea of algorithmic profiling. While a traditional profiler determines a set of measured cost values, an algorithmic profiler determines a cost function.
It does that by automatically determining the ``inputs'' of a program, by measuring the program's ``cost'' for any given input, and by inferring an empirical cost function.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jin:2012:UDR, author = "Guoliang Jin and Linhai Song and Xiaoming Shi and Joel Scherpelz and Shan Lu", title = "Understanding and detecting real-world performance bugs", journal = j-SIGPLAN, volume = "47", number = "6", pages = "77--88", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254075", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Developers frequently use inefficient code sequences that could be fixed by simple patches. These inefficient code sequences can cause significant performance degradation and resource waste, referred to as performance bugs. Meager increases in single threaded performance in the multi-core era and increasing emphasis on energy efficiency call for more effort in tackling performance bugs. This paper conducts a comprehensive study of 110 real-world performance bugs that are randomly sampled from five representative software suites (Apache, Chrome, GCC, Mozilla, and MySQL). The findings of this study provide guidance for future work to avoid, expose, detect, and fix performance bugs. Guided by our characteristics study, efficiency rules are extracted from 25 patches and are used to detect performance bugs. 332 previously unknown performance problems are found in the latest versions of MySQL, Apache, and Mozilla applications, including 219 performance problems found by applying rules across applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Coppa:2012:ISP, author = "Emilio Coppa and Camil Demetrescu and Irene Finocchi", title = "Input-sensitive profiling", journal = j-SIGPLAN, volume = "47", number = "6", pages = "89--98", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254076", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "In this paper we present a profiling methodology and toolkit for helping developers discover hidden asymptotic inefficiencies in the code. From one or more runs of a program, our profiler automatically measures how the performance of individual routines scales as a function of the input size, yielding clues to their growth rate. The output of the profiler is, for each executed routine of the program, a set of tuples that aggregate performance costs by input size. The collected profiles can be used to produce performance plots and derive trend functions by statistical curve fitting or bounding techniques. A key feature of our method is the ability to automatically measure the size of the input given to a generic code fragment: to this aim, we propose an effective metric for estimating the input size of a routine and show how to compute it efficiently. 
We discuss several case studies, showing that our approach can reveal asymptotic bottlenecks that other profilers may fail to detect and characterize the workload and behavior of individual routines in the context of real applications. To prove the feasibility of our techniques, we implemented a Valgrind tool called aprof and performed an extensive experimental evaluation on the SPEC CPU2006 benchmarks. Our experiments show that aprof delivers comparable performance to other prominent Valgrind tools, and can generate informative plots even from single runs on typical workloads for most algorithmically-critical routines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhang:2012:LBC, author = "Danfeng Zhang and Aslan Askarov and Andrew C. Myers", title = "Language-based control and mitigation of timing channels", journal = j-SIGPLAN, volume = "47", number = "6", pages = "99--110", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254078", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "We propose a new language-based approach to mitigating timing channels. In this language, well-typed programs provably leak only a bounded amount of information over time through external timing channels. By incorporating mechanisms for predictive mitigation of timing channels, this approach also permits a more expressive programming model. Timing channels arising from interaction with underlying hardware features such as instruction caches are controlled. Assumptions about the underlying hardware are explicitly formalized, supporting the design of hardware that efficiently controls timing channels. One such hardware design is modeled and used to show that timing channels can be controlled in some simple programs of real-world significance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chiw:2012:DPD, author = "Charisee Chiw and Gordon Kindlmann and John Reppy and Lamont Samuels and Nick Seltzer", title = "{Diderot}: a parallel {DSL} for image analysis and visualization", journal = j-SIGPLAN, volume = "47", number = "6", pages = "111--120", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254079", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Research scientists and medical professionals use imaging technology, such as computed tomography (CT) and magnetic resonance imaging (MRI) to measure a wide variety of biological and physical objects. The increasing sophistication of imaging technology creates demand for equally sophisticated computational techniques to analyze and visualize the image data. Analysis and visualization codes are often crafted for a specific experiment or set of images, thus imaging scientists need support for quickly developing codes that are reliable, robust, and efficient. 
In this paper, we present the design and implementation of Diderot, which is a parallel domain-specific language for biomedical image analysis and visualization. Diderot supports a high-level model of computation that is based on continuous tensor fields. These tensor fields are reconstructed from discrete image data using separable convolution kernels, but may also be defined by applying higher-order operations, such as differentiation ({$ \Delta $}). Early experiments demonstrate that Diderot provides both a high-level concise notation for image analysis and visualization algorithms, as well as high sequential and parallel performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cartey:2012:SGC, author = "Luke Cartey and Rune Lyngs{\o} and Oege de Moor", title = "Synthesising graphics card programs from {DSLs}", journal = j-SIGPLAN, volume = "47", number = "6", pages = "121--132", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254080", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Over the last five years, graphics cards have become a tempting target for scientific computing, thanks to unrivaled peak performance, often producing a runtime speed-up of x10 to x25 over comparable CPU solutions. However, this increase can be difficult to achieve, and doing so often requires a fundamental rethink. This is especially problematic in scientific computing, where experts do not want to learn yet another architecture. In this paper we develop a method for automatically parallelising recursive functions of the sort found in scientific papers. Using a static analysis of the function dependencies we identify sets --- partitions --- of independent elements, which we use to synthesise an efficient GPU implementation using polyhedral code generation techniques. We then augment our language with DSL extensions to support a wider variety of applications, and demonstrate the effectiveness of this with three case studies, showing significant performance improvement over equivalent CPU methods, and similar efficiency to hand-tuned GPU implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Raman:2012:PSF, author = "Arun Raman and Ayal Zaks and Jae W. Lee and David I. August", title = "{Parcae}: a system for flexible parallel execution", journal = j-SIGPLAN, volume = "47", number = "6", pages = "133--144", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254082", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Workload, platform, and available resources constitute a parallel program's execution environment. Most parallelization efforts statically target an anticipated range of environments, but performance generally degrades outside that range. Existing approaches address this problem with dynamic tuning but do not optimize a multiprogrammed system holistically. 
Further, they either require manual programming effort or are limited to array-based data-parallel programs. This paper presents Parcae, a generally applicable automatic system for platform-wide dynamic tuning. Parcae includes (i) the Nona compiler, which creates flexible parallel programs whose tasks can be efficiently reconfigured during execution; (ii) the Decima monitor, which measures resource availability and system performance to detect change in the environment; and (iii) the Morta executor, which cuts short the life of executing tasks, replacing them with other functionally equivalent tasks better suited to the current environment. Parallel programs made flexible by Parcae outperform original parallel implementations in many interesting scenarios.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tripp:2012:JEP, author = "Omer Tripp and Roman Manevich and John Field and Mooly Sagiv", title = "{JANUS}: exploiting parallelism via hindsight", journal = j-SIGPLAN, volume = "47", number = "6", pages = "145--156", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254083", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "This paper addresses the problem of reducing unnecessary conflicts in optimistic synchronization. Optimistic synchronization must ensure that any two concurrently executing transactions that commit are properly synchronized. Conflict detection is an approximate check for this condition. For efficiency, the traditional approach to conflict detection conservatively checks that the memory locations mutually accessed by two concurrent transactions are accessed only for reading. We present JANUS, a parallelization system that performs conflict detection by considering sequences of operations and their composite effect on the system's state. This is done efficiently, such that the runtime overhead due to conflict detection is on a par with that of write-conflict-based detection. In certain common scenarios, this mode of refinement dramatically improves the precision of conflict detection, thereby reducing the number of false conflicts. Our empirical evaluation of JANUS shows that this precision gain reduces the abort rate by an order of magnitude (22x on average), and achieves a speedup of up to 2.5x, on a suite of real-world benchmarks where no parallelism is exploited by the standard approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Turon:2012:REC, author = "Aaron Turon", title = "{Reagents}: expressing and composing fine-grained concurrency", journal = j-SIGPLAN, volume = "47", number = "6", pages = "157--168", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254084", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Efficient communication and synchronization is crucial for fine grained parallelism. 
Libraries providing such features, while indispensable, are difficult to write, and often cannot be tailored or composed to meet the needs of specific users. We introduce reagents, a set of combinators for concisely expressing concurrency algorithms. Reagents scale as well as their hand-coded counterparts, while providing the composability existing libraries lack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Carbin:2012:PAP, author = "Michael Carbin and Deokhwan Kim and Sasa Misailovic and Martin C. Rinard", title = "Proving acceptability properties of relaxed nondeterministic approximate programs", journal = j-SIGPLAN, volume = "47", number = "6", pages = "169--180", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254086", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Approximate program transformations such as skipping tasks [29, 30], loop perforation [21, 22, 35], reduction sampling [38], multiple selectable implementations [3, 4, 16, 38], dynamic knobs [16], synchronization elimination [20, 32], approximate function memoization [11],and approximate data types [34] produce programs that can execute at a variety of points in an underlying performance versus accuracy tradeoff space. These transformed programs have the ability to trade accuracy of their results for increased performance by dynamically and nondeterministically modifying variables that control their execution. We call such transformed programs relaxed programs because they have been extended with additional nondeterminism to relax their semantics and enable greater flexibility in their execution. We present language constructs for developing and specifying relaxed programs. We also present proof rules for reasoning about properties [28] which the program must satisfy to be acceptable. Our proof rules work with two kinds of acceptability properties: acceptability properties [28], which characterize desired relationships between the values of variables in the original and relaxed programs, and unary acceptability properties, which involve values only from a single (original or relaxed) program. The proof rules support a staged reasoning approach in which the majority of the reasoning effort works with the original program. Exploiting the common structure that the original and relaxed programs share, relational reasoning transfers reasoning effort from the original program to prove properties of the relaxed program. We have formalized the dynamic semantics of our target programming language and the proof rules in Coq and verified that the proof rules are sound with respect to the dynamic semantics. 
Our Coq implementation enables developers to obtain fully machine-checked verifications of their relaxed programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dillig:2012:AED, author = "Isil Dillig and Thomas Dillig and Alex Aiken", title = "Automated error diagnosis using abductive inference", journal = j-SIGPLAN, volume = "47", number = "6", pages = "181--192", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254087", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "When program verification tools fail to verify a program, either the program is buggy or the report is a false alarm. In this situation, the burden is on the user to manually classify the report, but this task is time-consuming, error-prone, and does not utilize facts already proven by the analysis. We present a new technique for assisting users in classifying error reports. Our technique computes small, relevant queries presented to a user that capture exactly the information the analysis is missing to either discharge or validate the error. Our insight is that identifying these missing facts is an instance of the abductive inference problem in logic, and we present a new algorithm for computing the smallest and most general abductions in this setting. We perform the first user study to rigorously evaluate the accuracy and effort involved in manual classification of error reports. Our study demonstrates that our new technique is very useful for improving both the speed and accuracy of error report classification. Specifically, our approach improves classification accuracy from 33\% to 90\% and reduces the time programmers take to classify error reports from approximately 5 minutes to under 1 minute.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kuznetsov:2012:ESM, author = "Volodymyr Kuznetsov and Johannes Kinder and Stefan Bucur and George Candea", title = "Efficient state merging in symbolic execution", journal = j-SIGPLAN, volume = "47", number = "6", pages = "193--204", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254088", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Symbolic execution has proven to be a practical technique for building automated test case generation and bug finding tools. Nevertheless, due to state explosion, these tools still struggle to achieve scalability. Given a program, one way to reduce the number of states that the tools need to explore is to merge states obtained on different paths. Alas, doing so increases the size of symbolic path conditions (thereby stressing the underlying constraint solver) and interferes with optimizations of the exploration process (also referred to as search strategies). The net effect is that state merging may actually lower performance rather than increase it. 
We present a way to automatically choose when and how to merge states such that the performance of symbolic execution is significantly increased. First, we present query count estimation, a method for statically estimating the impact that each symbolic variable has on solver queries that follow a potential merge point; states are then merged only when doing so promises to be advantageous. Second, we present dynamic state merging, a technique for merging states that interacts favorably with search strategies in automated test case generation and bug finding tools. Experiments on the 96 GNU Coreutils show that our approach consistently achieves several orders of magnitude speedup over previously published results. Our code and experimental data are publicly available at http://cloud9.epfl.ch.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wu:2012:SPA, author = "Jingyue Wu and Yang Tang and Gang Hu and Heming Cui and Junfeng Yang", title = "Sound and precise analysis of parallel programs through schedule specialization", journal = j-SIGPLAN, volume = "47", number = "6", pages = "205--216", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254090", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Parallel programs are known to be difficult to analyze. A key reason is that they typically have an enormous number of execution interleavings, or schedules. Static analysis over all schedules requires over-approximations, resulting in poor precision; dynamic analysis rarely covers more than a tiny fraction of all schedules. We propose an approach called schedule specialization to analyze a parallel program over only a small set of schedules for precision, and then enforce these schedules at runtime for soundness of the static analysis results. We build a schedule specialization framework for C/C++ multithreaded programs that use Pthreads. Our framework avoids the need to modify every analysis to be schedule-aware by specializing a program into a simpler program based on a schedule, so that the resultant program can be analyzed with stock analyses for improved precision. Moreover, our framework provides a precise schedule-aware def-use analysis on memory locations, enabling us to build three highly precise analyses: an alias analyzer, a data-race detector, and a path slicer. Evaluation on 17 programs, including 2 real-world programs and 15 popular benchmarks, shows that analyses using our framework reduced may-aliases by 61.9\%, false race reports by 69\%, and path slices by 48.7\%; and detected 7 unknown bugs in well-checked programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Albarghouthi:2012:PTI, author = "Aws Albarghouthi and Rahul Kumar and Aditya V. Nori and Sriram K. 
Rajamani", title = "Parallelizing top-down interprocedural analyses", journal = j-SIGPLAN, volume = "47", number = "6", pages = "217--228", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254091", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Modularity is a central theme in any scalable program analysis. The core idea in a modular analysis is to build summaries at procedure boundaries, and use the summary of a procedure to analyze the effect of calling it at its calling context. There are two ways to perform a modular program analysis: (1) top-down and (2) bottomup. A bottom-up analysis proceeds upwards from the leaves of the call graph, and analyzes each procedure in the most general calling context and builds its summary. In contrast, a top-down analysis starts from the root of the call graph, and proceeds downward, analyzing each procedure in its calling context. Top-down analyses have several applications in verification and software model checking. However, traditionally, bottom-up analyses have been easier to scale and parallelize than top-down analyses. In this paper, we propose a generic framework, BOLT, which uses MapReduce style parallelism to scale top-down analyses. In particular, we consider top-down analyses that are demand driven, such as the ones used for software model checking. In such analyses, each intraprocedural analysis happens in the context of a reachability query. A query Q over a procedure P results in query tree that consists of sub-queries over the procedures called by P. The key insight in BOLT is that the query tree can be explored in parallel using MapReduce style parallelism --- the map stage can be used to run a set of enabled queries in parallel, and the reduce stage can be used to manage inter-dependencies between queries. Iterating the map and reduce stages alternately, we can exploit the parallelism inherent in top-down analyses. Another unique feature of BOLT is that it is parameterized by the algorithm used for intraprocedural analysis. Several kinds of analyses, including may analyses, must analyses, and may-must-analyses can be parallelized using BOLT. We have implemented the BOLT framework and instantiated the intraprocedural parameter with a may-must-analysis. We have run BOLT on a test suite consisting of 45 Microsoft Windows device drivers and 150 safety properties. Our results demonstrate an average speedup of 3.71x and a maximum speedup of 7.4x (with 8 cores) over a sequential analysis. 
Moreover, in several checks where a sequential analysis fails, BOLT is able to successfully complete its analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Oh:2012:DIS, author = "Hakjoo Oh and Kihong Heo and Wonchan Lee and Woosuk Lee and Kwangkeun Yi", title = "Design and implementation of sparse global analyses for {C}-like languages", journal = j-SIGPLAN, volume = "47", number = "6", pages = "229--238", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254092", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "In this article we present a general method for achieving global static analyzers that are precise, sound, yet also scalable. Our method generalizes the sparse analysis techniques on top of the abstract interpretation framework to support relational as well as non-relational semantics properties for C-like languages. We first use the abstract interpretation framework to have a global static analyzer whose scalability is unattended. Upon this underlying sound static analyzer, we add our generalized sparse analysis techniques to improve its scalability while preserving the precision of the underlying analysis. Our framework determines what to prove to guarantee that the resulting sparse version should preserve the precision of the underlying analyzer. We formally present our framework; we present that existing sparse analyses are all restricted instances of our framework; we show more semantically elaborate design examples of sparse non-relational and relational static analyses; we present their implementation results that scale to analyze up to one million lines of C programs. We also show a set of implementation techniques that turn out to be critical to economically support the sparse analysis process.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hackett:2012:FPH, author = "Brian Hackett and Shu-yu Guo", title = "Fast and precise hybrid type inference for {JavaScript}", journal = j-SIGPLAN, volume = "47", number = "6", pages = "239--250", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254094", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "JavaScript performance is often bound by its dynamically typed nature. Compilers do not have access to static type information, making generation of efficient, type-specialized machine code difficult. We seek to solve this problem by inferring types. In this paper we present a hybrid type inference algorithm for JavaScript based on points-to analysis. Our algorithm is fast, in that it pays for itself in the optimizations it enables. Our algorithm is also precise, generating information that closely reflects the program's actual behavior even when analyzing polymorphic code, by augmenting static analysis with run-time type barriers. We showcase an implementation for Mozilla Firefox's JavaScript engine, demonstrating both performance gains and viability. 
Through integration with the just-in-time (JIT) compiler in Firefox, we have improved performance on major benchmarks and JavaScript-heavy websites by up to 50\%. Inference-enabled compilation is the default compilation mode as of Firefox 9.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Petrov:2012:RDW, author = "Boris Petrov and Martin Vechev and Manu Sridharan and Julian Dolby", title = "Race detection for {Web} applications", journal = j-SIGPLAN, volume = "47", number = "6", pages = "251--262", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254095", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Modern web pages are becoming increasingly full-featured, and this additional functionality often requires greater use of asynchrony. Unfortunately, this asynchrony can trigger unexpected concurrency errors, even though web page scripts are executed sequentially. We present the first formulation of a happens-before relation for common web platform features. Developing this relation was a non-trivial task, due to complex feature interactions and browser differences. We also present a logical memory access model for web applications that abstracts away browser implementation details. Based on the above, we implemented WebRacer, the first dynamic race detector for web applications. WebRacer is implemented atop the production-quality WebKit engine, enabling testing of full-featured web sites. WebRacer can also simulate certain user actions, exposing more races. We evaluated WebRacer by testing a large set of Fortune 100 company web sites. We discovered many harmful races, and also gained insights into how developers handle asynchrony in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fischer:2012:EDM, author = "Jeffrey Fischer and Rupak Majumdar and Shahram Esmaeilsabzali", title = "{Engage}: a deployment management system", journal = j-SIGPLAN, volume = "47", number = "6", pages = "263--274", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254096", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Many modern applications are built by combining independently developed packages and services that are distributed over many machines with complex inter-dependencies. The assembly, installation, and management of such applications is hard, and usually performed either manually or by writing customized scripts. We present Engage, a system for configuring, installing, and managing complex application stacks. Engage consists of three components: a domain-specific model to describe component metadata and inter-component dependencies; a constraint-based algorithm that takes a partial installation specification and computes a full installation plan; and a runtime system that co-ordinates the deployment of the application across multiple machines and manages the deployed system. 
By explicitly modeling configuration metadata and inter-component dependencies, Engage enables static checking of application configurations and automated, constraint-driven, generation of installation plans across multiple machines. This reduces the tedious manual process of application configuration, installation, and management. We have implemented Engage and we have used it to successfully host a number of applications. We describe our experiences in using Engage to manage a generic platform that hosts Django applications in the cloud or on premises.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Perelman:2012:TDC, author = "Daniel Perelman and Sumit Gulwani and Thomas Ball and Dan Grossman", title = "Type-directed completion of partial expressions", journal = j-SIGPLAN, volume = "47", number = "6", pages = "275--286", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254098", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Modern programming frameworks provide enormous libraries arranged in complex structures, so much so that a large part of modern programming is searching for APIs that surely exist ``somewhere in an unfamiliar part of the framework.'' We present a novel way of phrasing a search for an unknown API: the programmer simply writes an expression leaving holes for the parts they do not know. We call these expressions partial expressions. We present an efficient algorithm that produces likely completions ordered by a ranking scheme based primarily on the similarity of the types of the APIs suggested to the types of the known expressions. This gives a powerful language for both API discovery and code completion with a small impedance mismatch from writing code. In an automated experiment on mature C\# projects, we show our algorithm can place the intended expression in the top 10 choices over 80\% of the time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{hunEom:2012:SSJ, author = "Yong hun Eom and Brian Demsky", title = "Self-stabilizing {Java}", journal = j-SIGPLAN, volume = "47", number = "6", pages = "287--298", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254099", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Self-stabilizing programs automatically recover from state corruption caused by software bugs and other sources to reach the correct state. A number of applications are inherently self-stabilizing---such programs typically overwrite all non-constant data with new input data. We present a type system and static analyses that together check whether a program is self-stabilizing. We combine this with a code generation strategy that ensures that a program continues executing long enough to self-stabilize. 
Our experience using SJava indicates that (1) SJava annotations are easy to write once one understands a program and (2) SJava successfully checked that several benchmarks were self-stabilizing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2012:TDA, author = "Yan Chen and Joshua Dunfield and Umut A. Acar", title = "Type-directed automatic incrementalization", journal = j-SIGPLAN, volume = "47", number = "6", pages = "299--310", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254100", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Application data often changes slowly or incrementally over time. Since incremental changes to input often result in only small changes in output, it is often feasible to respond to such changes asymptotically more efficiently than by re-running the whole computation. Traditionally, realizing such asymptotic efficiency improvements requires designing problem-specific algorithms known as dynamic or incremental algorithms, which are often significantly more complicated than conventional algorithms to design, analyze, implement, and use. A long-standing open problem is to develop techniques that automatically transform conventional programs so that they correctly and efficiently respond to incremental changes. In this paper, we describe a significant step towards solving the problem of automatic incrementalization: a programming language and a compiler that can, given a few type annotations describing what can change over time, compile a conventional program that assumes its data to be static (unchanging over time) to an incremental program. Based on recent advances in self-adjusting computation, including a theoretical proposal for translating purely functional programs to self-adjusting programs, we develop techniques for translating conventional Standard ML programs to self-adjusting programs. By extending the Standard ML language, we design a fully featured programming language with higher-order features, a module system, and a powerful type system, and implement a compiler for this language. The resulting programming language, LML, enables translating conventional programs decorated with simple type annotations into incremental programs that can respond to changes in their data correctly and efficiently. We evaluate the effectiveness of our approach by considering a range of benchmarks involving lists, vectors, and matrices, as well as a ray tracer. For these benchmarks, our compiler incrementalizes existing code with only trivial amounts of annotation. 
The resulting programs are often asymptotically more efficient, leading to orders of magnitude speedups in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sarkar:2012:SCC, author = "Susmit Sarkar and Kayvan Memarian and Scott Owens and Mark Batty and Peter Sewell and Luc Maranget and Jade Alglave and Derek Williams", title = "Synchronising {C\slash C++} and {POWER}", journal = j-SIGPLAN, volume = "47", number = "6", pages = "311--322", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254102", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Shared memory concurrency relies on synchronisation primitives: compare-and-swap, load-reserve/store-conditional (aka LL/SC), language-level mutexes, and so on. In a sequentially consistent setting, or even in the TSO setting of x86 and Sparc, these have well-understood semantics. But in the very relaxed settings of IBM\reg{}, POWER\reg{}, ARM, or C/C++, it remains surprisingly unclear exactly what the programmer can depend on. This paper studies relaxed-memory synchronisation. On the hardware side, we give a clear semantic characterisation of the load-reserve/store-conditional primitives as provided by POWER multiprocessors, for the first time since they were introduced 20 years ago; we cover their interaction with relaxed loads, stores, barriers, and dependencies. Our model, while not officially sanctioned by the vendor, is validated by extensive testing, comparing actual implementation behaviour against an oracle generated from the model, and by detailed discussion with IBM staff. We believe the ARM semantics to be similar. On the software side, we prove sound a proposed compilation scheme of the C/C++ synchronisation constructs to POWER, including C/C++ spinlock mutexes, fences, and read-modify-write operations, together with the simpler atomic operations for which soundness is already known from our previous work; this is a first step in verifying concurrent algorithms that use load-reserve/store-conditional with respect to a realistic semantics. We also build confidence in the C/C++ model in its own terms, fixing some omissions and contributing to the C standards committee adoption of the C++11 concurrency model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gazzillo:2012:SPA, author = "Paul Gazzillo and Robert Grimm", title = "{SuperC}: parsing all of {C} by taming the preprocessor", journal = j-SIGPLAN, volume = "47", number = "6", pages = "323--334", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254103", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "C tools, such as source browsers, bug finders, and automated refactorings, need to process two languages: C itself and the preprocessor. The latter improves expressivity through file includes, macros, and static conditionals. But it operates only on tokens, making it hard to even parse both languages. 
This paper presents a complete, performant solution to this problem. First, a configuration-preserving preprocessor resolves includes and macros yet leaves static conditionals intact, thus preserving a program's variability. To ensure completeness, we analyze all interactions between preprocessor features and identify techniques for correctly handling them. Second, a configuration-preserving parser generates a well-formed AST with static choice nodes for conditionals. It forks new subparsers when encountering static conditionals and merges them again after the conditionals. To ensure performance, we present a simple algorithm for table-driven Fork-Merge LR parsing and four novel optimizations. We demonstrate the effectiveness of our approach on the x86 Linux kernel.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Regehr:2012:TCR, author = "John Regehr and Yang Chen and Pascal Cuoq and Eric Eide and Chucky Ellison and Xuejun Yang", title = "Test-case reduction for {C} compiler bugs", journal = j-SIGPLAN, volume = "47", number = "6", pages = "335--346", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254104", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "To report a compiler bug, one must often find a small test case that triggers the bug. The existing approach to automated test-case reduction, delta debugging, works by removing substrings of the original input; the result is a concatenation of substrings that delta cannot remove. We have found this approach less than ideal for reducing C programs because it typically yields test cases that are too large or even invalid (relying on undefined behavior). To obtain small and valid test cases consistently, we designed and implemented three new, domain-specific test-case reducers. The best of these is based on a novel framework in which a generic fixpoint computation invokes modular transformations that perform reduction operations. This reducer produces outputs that are, on average, more than 25 times smaller than those produced by our other reducers or by the existing reducer that is most commonly used by compiler developers. We conclude that effective program reduction requires more than straightforward delta debugging.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liu:2012:CFE, author = "Jun Liu and Yuanrui Zhang and Ohyoung Jang and Wei Ding and Mahmut Kandemir", title = "A compiler framework for extracting superword level parallelism", journal = j-SIGPLAN, volume = "47", number = "6", pages = "347--358", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254106", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "SIMD (single-instruction multiple-data) instruction set extensions are quite common today in both high performance and embedded microprocessors, and enable the exploitation of a specific type of data parallelism called SLP (Superword Level Parallelism). 
While prior research shows that significant performance savings are possible when SLP is exploited, placing SIMD instructions in an application code manually can be very difficult and error prone. In this paper, we propose a novel automated compiler framework for improving superword level parallelism exploitation. The key part of our framework consists of two stages: superword statement generation and data layout optimization. The first stage is our main contribution and has two phases, statement grouping and statement scheduling, of which the primary goals are to increase SIMD parallelism and, more importantly, capture more superword reuses among the superword statements through global data access and reuse pattern analysis. Further, as a complementary optimization, our data layout optimization organizes data in memory space such that the price of memory operations for SLP is minimized. The results from our compiler implementation and tests on two systems indicate performance improvements as high as 15.2\% over a state-of-the-art SLP optimization algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Johnson:2012:SSP, author = "Nick P. Johnson and Hanjun Kim and Prakash Prabhu and Ayal Zaks and David I. August", title = "Speculative separation for privatization and reductions", journal = j-SIGPLAN, volume = "47", number = "6", pages = "359--370", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254107", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Automatic parallelization is a promising strategy to improve application performance in the multicore era. However, common programming practices such as the reuse of data structures introduce artificial constraints that obstruct automatic parallelization. Privatization relieves these constraints by replicating data structures, thus enabling scalable parallelization. Prior privatization schemes are limited to arrays and scalar variables because they are sensitive to the layout of dynamic data structures. This work presents Privateer, the first fully automatic privatization system to handle dynamic and recursive data structures, even in languages with unrestricted pointers. To reduce sensitivity to memory layout, Privateer speculatively separates memory objects. Privateer's lightweight runtime system validates speculative separation and speculative privatization to ensure correct parallel execution. Privateer enables automatic parallelization of general-purpose C/C++ applications, yielding a geomean whole-program speedup of 11.4x over best sequential execution on 24 cores, while non-speculative parallelization yields only 0.93x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Holewinski:2012:DTB, author = "Justin Holewinski and Ragavendar Ramamurthi and Mahesh Ravishankar and Naznin Fauzia and Louis-No{\"e}l Pouchet and Atanas Rountev and P. 
Sadayappan", title = "Dynamic trace-based analysis of vectorization potential of applications", journal = j-SIGPLAN, volume = "47", number = "6", pages = "371--382", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254108", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Recent hardware trends with GPUs and the increasing vector lengths of SSE-like ISA extensions for multicore CPUs imply that effective exploitation of SIMD parallelism is critical for achieving high performance on emerging and future architectures. A vast majority of existing applications were developed without any attention by their developers towards effective vectorizability of the codes. While developers of production compilers such as GNU gcc, Intel icc, PGI pgcc, and IBM xlc have invested considerable effort and made significant advances in enhancing automatic vectorization capabilities, these compilers still cannot effectively vectorize many existing scientific and engineering codes. It is therefore of considerable interest to analyze existing applications to assess the inherent latent potential for SIMD parallelism, exploitable through further compiler advances and/or via manual code changes. In this paper we develop an approach to infer a program's SIMD parallelization potential by analyzing the dynamic data-dependence graph derived from a sequential execution trace. By considering only the observed run-time data dependences for the trace, and by relaxing the execution order of operations to allow any dependence-preserving reordering, we can detect potential SIMD parallelism that may otherwise be missed by more conservative compile-time analyses. We show that for several benchmarks our tool discovers regions of code within computationally-intensive loops that exhibit high potential for SIMD parallelism but are not vectorized by state-of-the-art compilers. We present several case studies of the use of the tool, both in identifying opportunities to enhance the transformation capabilities of vectorizing compilers, as well as in pointing to code regions to manually modify in order to enable auto-vectorization and performance improvement by existing compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Leung:2012:VGK, author = "Alan Leung and Manish Gupta and Yuvraj Agarwal and Rajesh Gupta and Ranjit Jhala and Sorin Lerner", title = "Verifying {GPU} kernels by test amplification", journal = j-SIGPLAN, volume = "47", number = "6", pages = "383--394", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254110", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "We present a novel technique for verifying properties of data parallel GPU programs via test amplification. The key insight behind our work is that we can use the technique of static information flow to amplify the result of a single test execution over the set of all inputs and interleavings that affect the property being verified. 
We empirically demonstrate the effectiveness of test amplification for verifying race-freedom and determinism over a large number of standard GPU kernels, by showing that the result of verifying a single dynamic execution can be amplified over the massive space of possible data inputs and thread interleavings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Morrisett:2012:RBF, author = "Greg Morrisett and Gang Tan and Joseph Tassarotti and Jean-Baptiste Tristan and Edward Gan", title = "{RockSalt}: better, faster, stronger {SFI} for the x86", journal = j-SIGPLAN, volume = "47", number = "6", pages = "395--404", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254111", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Software-based fault isolation (SFI), as used in Google's Native Client (NaCl), relies upon a conceptually simple machine-code analysis to enforce a security policy. But for complicated architectures such as the x86, it is all too easy to get the details of the analysis wrong. We have built a new checker that is smaller, faster, and has a much reduced trusted computing base when compared to Google's original analysis. The key to our approach is automatically generating the bulk of the analysis from a declarative description which we relate to a formal model of a subset of the x86 instruction set architecture. The x86 model, developed in Coq, is of independent interest and should be usable for a wide range of machine-level verification tasks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Grebenshchikov:2012:SSV, author = "Sergey Grebenshchikov and Nuno P. Lopes and Corneliu Popeea and Andrey Rybalchenko", title = "Synthesizing software verifiers from proof rules", journal = j-SIGPLAN, volume = "47", number = "6", pages = "405--416", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254112", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Automatically generated tools can significantly improve programmer productivity. For example, parsers and dataflow analyzers can be automatically generated from declarative specifications in the form of grammars, which tremendously simplifies the task of implementing a compiler. In this paper, we present a method for the automatic synthesis of software verification tools. Our synthesis procedure takes as input a description of the employed proof rule, e.g., program safety checking via inductive invariants, and produces a tool that automatically discovers the auxiliary assertions required by the proof rule, e.g., inductive loop invariants and procedure summaries. We rely on a (standard) representation of proof rules using recursive equations over the auxiliary assertions. The discovery of auxiliary assertions, i.e., solving the equations, is based on an iterative process that extrapolates solutions obtained for finitary unrollings of equations. 
We show how our method synthesizes automatic safety and liveness verifiers for programs with procedures, multi-threaded programs, and functional programs. Our experimental comparison of the resulting verifiers with existing state-of-the-art verification tools confirms the practicality of the approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hawkins:2012:CDR, author = "Peter Hawkins and Alex Aiken and Kathleen Fisher and Martin Rinard and Mooly Sagiv", title = "Concurrent data representation synthesis", journal = j-SIGPLAN, volume = "47", number = "6", pages = "417--428", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254114", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "We describe an approach for synthesizing data representations for concurrent programs. Our compiler takes as input a program written using concurrent relations and synthesizes a representation of the relations as sets of cooperating data structures as well as the placement and acquisition of locks to synchronize concurrent access to those data structures. The resulting code is correct by construction: individual relational operations are implemented correctly and the aggregate set of operations is serializable and deadlock free. The relational specification also permits a high-level optimizer to choose the best performing of many possible legal data representations and locking strategies, which we demonstrate with an experiment autotuning a graph benchmark.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liu:2012:DSR, author = "Feng Liu and Nayden Nedev and Nedyalko Prisadnikov and Martin Vechev and Eran Yahav", title = "Dynamic synthesis for relaxed memory models", journal = j-SIGPLAN, volume = "47", number = "6", pages = "429--440", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254115", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Modern architectures implement relaxed memory models which may reorder memory operations or execute them non-atomically. Special instructions called memory fences are provided, allowing control of this behavior. To implement a concurrent algorithm for a modern architecture, the programmer is forced to manually reason about subtle relaxed behaviors and figure out ways to control these behaviors by adding fences to the program. Not only is this process time consuming and error-prone, but it has to be repeated every time the implementation is ported to a different architecture. In this paper, we present the first scalable framework for handling real-world concurrent algorithms running on relaxed architectures. Given a concurrent C program, a safety specification, and a description of the memory model, our framework tests the program on the memory model to expose violations of the specification, and synthesizes a set of necessary ordering constraints that prevent these violations. 
The ordering constraints are then realized as additional fences in the program. We implemented our approach in a tool called DFence based on LLVM and used it to infer fences in a number of concurrent algorithms. Using DFence, we perform the first in-depth study of the interaction between fences in real-world concurrent C programs, correctness criteria such as sequential consistency and linearizability, and memory models such as TSO and PSO, yielding many interesting observations. We believe that this is the first tool that can handle programs at the scale and complexity of a lock-free memory allocator.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Godefroid:2012:ASS, author = "Patrice Godefroid and Ankur Taly", title = "Automated synthesis of symbolic instruction encodings from {I/O} samples", journal = j-SIGPLAN, volume = "47", number = "6", pages = "441--452", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254116", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Symbolic execution is a key component of precise binary program analysis tools. We discuss how to automatically boot-strap the construction of a symbolic execution engine for a processor instruction set such as x86, x64 or ARM. We show how to automatically synthesize symbolic representations of individual processor instructions from input/output examples and express them as bit-vector constraints. We present and compare various synthesis algorithms and instruction sampling strategies. We introduce a new synthesis algorithm based on smart sampling which we show is one to two orders of magnitude faster than previous synthesis algorithms in our context. With this new algorithm, we can automatically synthesize bit-vector circuits for over 500 x86 instructions (8/16/32-bits, outputs, EFLAGS) using only 6 synthesis templates and in less than two hours using the Z3 SMT solver on a regular machine. During this work, we also discovered several inconsistencies across x86 processors, errors in the x86 Intel spec, and several bugs in previous manually-written x86 instruction handlers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Benz:2012:DPA, author = "Florian Benz and Andreas Hildebrandt and Sebastian Hack", title = "A dynamic program analysis to find floating-point accuracy problems", journal = j-SIGPLAN, volume = "47", number = "6", pages = "453--462", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254118", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Programs using floating-point arithmetic are prone to accuracy problems caused by rounding and catastrophic cancellation. These phenomena provoke bugs that are notoriously hard to track down: the program does not necessarily crash and the results are not necessarily obviously wrong, but often subtly inaccurate. Further use of these values can lead to catastrophic errors. 
In this paper, we present a dynamic program analysis that supports the programmer in finding accuracy problems. Our analysis uses binary translation to perform every floating-point computation side by side in higher precision. Furthermore, we use a lightweight slicing approach to track the evolution of errors. We evaluate our analysis by demonstrating that it catches well-known floating-point accuracy problems and by analyzing the Spec CFP2006 floating-point benchmark. In the latter, we show how our tool tracks down a catastrophic cancellation that causes a complete loss of accuracy leading to a meaningless program result. Finally, we apply our program to a complex, real-world bioinformatics application in which our program detected a serious cancellation. Correcting the instability led not only to improved quality of the result, but also to an improvement of the program's run time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lee:2012:CHP, author = "Dongyoon Lee and Peter M. Chen and Jason Flinn and Satish Narayanasamy", title = "{Chimera}: hybrid program analysis for determinism", journal = j-SIGPLAN, volume = "47", number = "6", pages = "463--474", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254119", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Chimera uses a new hybrid program analysis to provide deterministic replay for commodity multiprocessor systems. Chimera leverages the insight that it is easy to provide deterministic multiprocessor replay for data-race-free programs (one can just record non-deterministic inputs and the order of synchronization operations), so if we can somehow transform an arbitrary program to be data-race-free, then we can provide deterministic replay cheaply for that program. To perform this transformation, Chimera uses a sound static data-race detector to find all potential data-races. It then instruments pairs of potentially racing instructions with a weak-lock, which provides sufficient guarantees to allow deterministic replay but does not guarantee mutual exclusion. Unsurprisingly, a large fraction of data-races found by the static tool are false data-races, and instrumenting each of them with a weak-lock results in prohibitively high overhead.
Chimera drastically reduces this cost from 53x to 1.39x by increasing the granularity of weak-locks without significantly compromising on parallelism. This is achieved by employing a combination of profiling and symbolic analysis techniques that target the sources of imprecision in the static data-race detector. We find that performance overhead for deterministic recording is 2.4\% on average for Apache and desktop applications and about 86\% for scientific applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{deKruijf:2012:SAC, author = "Marc A. de Kruijf and Karthikeyan Sankaralingam and Somesh Jha", title = "Static analysis and compiler design for idempotent processing", journal = j-SIGPLAN, volume = "47", number = "6", pages = "475--486", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254120", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Recovery functionality has many applications in computing systems, from speculation recovery in modern microprocessors to fault recovery in high-reliability systems. Modern systems commonly recover using checkpoints. However, checkpoints introduce overheads, add complexity, and often save more state than necessary. This paper develops a novel compiler technique to recover program state without the overheads of explicit checkpoints. The technique breaks programs into idempotent regions ---regions that can be freely re-executed---which allows recovery without checkpointed state. Leveraging the property of idempotence, recovery can be obtained by simple re-execution. We develop static analysis techniques to construct these regions and demonstrate low overheads and large region sizes for an LLVM-based implementation. Across a set of diverse benchmark suites, we construct idempotent regions close in size to those that could be obtained with perfect runtime information. Although the resulting code runs more slowly, typical performance overheads are in the range of just 2-12\%. The paradigm of executing entire programs as a series of idempotent regions we call idempotent processing, and it has many applications in computer systems. As a concrete example, we demonstrate it applied to the problem of compiler-automated hardware fault recovery. 
In comparison to two other state-of-the-art techniques, redundant execution and checkpoint-logging, our idempotent processing technique outperforms both by over 15\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Feng:2012:EPL, author = "Min Feng and Rajiv Gupta and Iulian Neamtiu", title = "Effective parallelization of loops in the presence of {I/O} operations", journal = j-SIGPLAN, volume = "47", number = "6", pages = "487--498", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254122", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Software-based thread-level parallelization has been widely studied for exploiting data parallelism in purely computational loops to improve program performance on multiprocessors. However, none of the previous efforts deal with efficient parallelization of hybrid loops, i.e., loops that contain a mix of computation and I/O operations. In this paper, we propose a set of techniques for efficiently parallelizing hybrid loops. Our techniques apply DOALL parallelism to hybrid loops by breaking the cross-iteration dependences caused by I/O operations. We also support speculative execution of I/O operations to enable speculative parallelization of hybrid loops. Helper threading is used to reduce the I/O bus contention caused by the improved parallelism. We provide an easy-to-use programming model for exploiting parallelism in loops with I/O operations. Parallelizing hybrid loops using our model requires few modifications to the code. We have developed a prototype implementation of our programming model. We have evaluated our implementation on a 24-core machine using eight applications, including a widely-used genomic sequence assembler and a multi-player game server, and others from PARSEC and SPEC CPU2000 benchmark suites. The hybrid loops in these applications take 23\%-99\% of the total execution time on our 24-core machine. The parallelized applications achieve speedups of 3.0x-12.8x with hybrid loop parallelization over the sequential versions of the same applications. Compared to the versions of applications where only computation loops are parallelized, hybrid loop parallelization improves the application performance by 68\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2012:PSR, author = "Chun Chen", title = "Polyhedra scanning revisited", journal = j-SIGPLAN, volume = "47", number = "6", pages = "499--508", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254123", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "This paper presents a new polyhedra scanning system called CodeGen+ to address the challenge of generating high-performance code for complex iteration spaces resulting from compiler optimization and autotuning systems. The strength of our approach lies in two new algorithms. 
First, a loop overhead removal algorithm provides precise control of trade-offs between loop overhead and code size based on actual loop nesting depth. Second, an if-statement simplification algorithm further reduces the number of comparisons in the code. These algorithms combined with the expressive power of Presburger arithmetic enable CodeGen+ to support complex optimization strategies expressed in iteration spaces. We compare with the state-of-the-art polyhedra scanning tool CLooG on five loop nest computations, demonstrating that CodeGen+ generates code that is simpler and up to 1.15x faster.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Oancea:2012:LIT, author = "Cosmin E. Oancea and Lawrence Rauchwerger", title = "Logical inference techniques for loop parallelization", journal = j-SIGPLAN, volume = "47", number = "6", pages = "509--520", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254124", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "This paper presents a fully automatic approach to loop parallelization that integrates the use of static and run-time analysis and thus overcomes many known difficulties such as nonlinear and indirect array indexing and complex control flow. Our hybrid analysis framework validates the parallelization transformation by verifying the independence of the loop's memory references. To this end it represents array references using the USR (uniform set representation) language and expresses the independence condition as an equation, $S = 0$, where S is a set expression representing array indexes. Using a language instead of an array-abstraction representation for S results in a smaller number of conservative approximations but exhibits a potentially-high runtime cost. To alleviate this cost we introduce a language translation F from the USR set-expression language to an equally rich language of predicates ($F(S) \implies S = 0$). Loop parallelization is then validated using a novel logic inference algorithm that factorizes the obtained complex predicates ($F(S)$) into a sequence of sufficient independence conditions that are evaluated first statically and, when needed, dynamically, in increasing order of their estimated complexities. We evaluate our automated solution on 26 benchmarks from PERFECT-Club and SPEC suites and show that our approach is effective in parallelizing large, complex loops and obtains much better full program speedups than the Intel and IBM Fortran compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pradel:2012:FAP, author = "Michael Pradel and Thomas R.
Gross", title = "Fully automatic and precise detection of thread safety violations", journal = j-SIGPLAN, volume = "47", number = "6", pages = "521--530", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254126", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Concurrent, object-oriented programs often use thread-safe library classes. Existing techniques for testing a thread-safe class either rely on tests using the class, on formal specifications, or on both. Unfortunately, these techniques often are not fully automatic as they involve the user in analyzing the output. This paper presents an automatic testing technique that reveals concurrency bugs in supposedly thread-safe classes. The analysis requires as input only the class under test and reports only true positives. The key idea is to generate tests in which multiple threads call methods on a shared instance of the tested class. If a concurrent test exhibits an exception or a deadlock that cannot be triggered in any linearized execution of the test, the analysis reports a thread safety violation. The approach is easily applicable, because it is independent of hand-written tests and explicit specifications. The analysis finds 15 concurrency bugs in popular Java libraries, including two previously unknown bugs in the Java standard library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Raman:2012:SPD, author = "Raghavan Raman and Jisheng Zhao and Vivek Sarkar and Martin Vechev and Eran Yahav", title = "Scalable and precise dynamic datarace detection for structured parallelism", journal = j-SIGPLAN, volume = "47", number = "6", pages = "531--542", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254127", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Existing dynamic race detectors suffer from at least one of the following three limitations: (i) space overhead per memory location grows linearly with the number of parallel threads [13], severely limiting the parallelism that the algorithm can handle; (ii) sequentialization: the parallel program must be processed in a sequential order, usually depth-first [12, 24]. This prevents the analysis from scaling with available hardware parallelism, inherently limiting its performance; (iii) inefficiency: even though race detectors with good theoretical complexity exist, they do not admit efficient implementations and are unsuitable for practical use [4, 18]. We present a new precise dynamic race detector that leverages structured parallelism in order to address these limitations. Our algorithm requires constant space per memory location, works in parallel, and is efficient in practice. We implemented and evaluated our algorithm on a set of 15 benchmarks. 
Our experimental results indicate an average (geometric mean) slowdown of 2.78x on a 16-core SMP system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nagarakatte:2012:MAP, author = "Santosh Nagarakatte and Sebastian Burckhardt and Milo M. K. Martin and Madanlal Musuvathi", title = "Multicore acceleration of priority-based schedulers for concurrency bug detection", journal = j-SIGPLAN, volume = "47", number = "6", pages = "543--554", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254128", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PLDI '12 proceedings.", abstract = "Testing multithreaded programs is difficult as threads can interleave in a nondeterministic fashion. Untested interleavings can cause failures, but testing all interleavings is infeasible. Many interleaving exploration strategies for bug detection have been proposed, but their relative effectiveness and performance remains unclear as they often lack publicly available implementations and have not been evaluated using common benchmarks. We describe NeedlePoint, an open-source framework that allows selection and comparison of a wide range of interleaving exploration policies for bug detection proposed by prior work. Our experience with NeedlePoint indicates that priority-based probabilistic concurrency testing (the PCT algorithm) finds bugs quickly, but it runs only one thread at a time, which destroys parallelism by serializing executions. To address this problem we propose a parallel version of the PCT algorithm (PPCT). We show that the new algorithm outperforms the original by a factor of 5x when testing parallel programs on an eight-core machine. We formally prove that parallel PCT provides the same probabilistic coverage guarantees as PCT. Moreover, PPCT is the first algorithm that runs multiple threads while providing coverage guarantees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nieh:2012:CBR, author = "Jason Nieh", title = "Challenges in building a real, large private cloud", journal = j-SIGPLAN, volume = "47", number = "7", pages = "1--2", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151026", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Virtualization and internal cloud are often touted as the solution to many challenging problems, from resource underutilization to data-center optimization and carbon emission reduction. However, the hidden costs of cloud-scale virtualization, largely stemming from the complex and difficult system administration challenges it poses, are often overlooked. 
Reaping the fruits of an internal Infrastructure-as-a-Service cloud requires the enterprise to navigate scalability limitations, revamp traditional operational practices, manage performance, and achieve unprecedented cross-silo collaboration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kundu:2012:MVA, author = "Sajib Kundu and Raju Rangaswami and Ajay Gulati and Ming Zhao and Kaushik Dutta", title = "Modeling virtualized applications using machine learning techniques", journal = j-SIGPLAN, volume = "47", number = "7", pages = "3--14", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151028", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "With the growing adoption of virtualized datacenters and cloud hosting services, the allocation and sizing of resources such as CPU, memory, and I/O bandwidth for virtual machines (VMs) is becoming increasingly important. Accurate performance modeling of an application would help users in better VM sizing, thus reducing costs. It can also benefit cloud service providers who can offer a new charging model based on the VMs' performance instead of their configured sizes. In this paper, we present techniques to model the performance of a VM-hosted application as a function of the resources allocated to the VM and the resource contention it experiences. To address this multi-dimensional modeling problem, we propose and refine the use of two machine learning techniques: artificial neural network (ANN) and support vector machine (SVM). We evaluate these modeling techniques using five virtualized applications from the RUBiS and Filebench suite of benchmarks and demonstrate that their median and 90th percentile prediction errors are within 4.36\% and 29.17\%, respectively. These results are substantially better than regression-based approaches as well as direct applications of machine learning techniques without our refinements. We also present a simple and effective approach to VM sizing and empirically demonstrate that it can deliver optimal results for 65\% of the sizing problems that we studied and produces close-to-optimal sizes for the remaining 35\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lv:2012:VCV, author = "Hui Lv and Yaozu Dong and Jiangang Duan and Kevin Tian", title = "Virtualization challenges: a view from server consolidation perspective", journal = j-SIGPLAN, volume = "47", number = "7", pages = "15--26", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151030", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Server consolidation, by running multiple virtual machines on top of a single platform with virtualization, provides an efficient solution for the parallelism and utilization of modern multi-core processor systems. However, the performance and scalability of server consolidation solutions on modern large-scale servers are not well addressed.
In this paper, we conduct a comprehensive characterization of Xen performance and scalability running SPECvirt\_sc2010, and identify that a large memory and cache footprint, caused by unnecessarily frequent context switches, introduces additional challenges to system performance and scalability. We propose two optimizations (dynamically-allocable tasklets and a context-switch rate controller) to improve performance. The results show improved memory and cache efficiency with a reduction in overall CPI, improving server consolidation capability by 15\% in SPECvirt\_sc2010. At the same time, our optimizations accelerate service response by up to 50\%, which greatly improves the QoS of the Xen virtualization solution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wang:2012:RCV, author = "Wei Wang and Tanima Dey and Ryan W. Moore and Mahmut Aktasoglu and Bruce R. Childers and Jack W. Davidson and Mary Jane Irwin and Mahmut Kandemir and Mary Lou Soffa", title = "{REEact}: a customizable virtual execution manager for multicore platforms", journal = j-SIGPLAN, volume = "47", number = "7", pages = "27--38", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151031", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "With the shift to many-core chip multiprocessors (CMPs), a critical issue is how to effectively coordinate and manage the execution of applications and hardware resources to overcome performance, power consumption, and reliability challenges stemming from hardware and application variations inherent in this new computing environment. Effective resource and application management on CMPs requires consideration of user/application/hardware-specific requirements and dynamic adaptation of management decisions based on the actual run-time environment. However, designing an algorithm to manage resources and applications that can dynamically adapt based on the run-time environment is difficult because most resource and application management and monitoring facilities are only available at the operating system level. This paper presents REEact, an infrastructure that provides the capability to specify user-level management policies with dynamic adaptation. REEact is a virtual execution environment that provides a framework and core services to quickly enable the design of custom management policies for dynamically managing resources and applications. To demonstrate the capabilities and usefulness of REEact, this paper describes three case studies--each illustrating the use of REEact to apply a specific dynamic management policy on a real CMP.
Through these case studies, we demonstrate that REEact can effectively and efficiently implement policies to dynamically manage resources and adapt application execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ma:2012:DTD, author = "Zhiqiang Ma and Zhonghua Sheng and Lin Gu and Liufei Wen and Gong Zhang", title = "{DVM}: towards a datacenter-scale virtual machine", journal = j-SIGPLAN, volume = "47", number = "7", pages = "39--50", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151032", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "As cloud-based computation becomes increasingly important, providing a general computational interface to support datacenter-scale programming has become an imperative research agenda. Many cloud systems use existing virtual machine monitor (VMM) technologies, such as Xen, VMware, and Windows Hypervisor, to multiplex a physical host into multiple virtual hosts and isolate computation on the shared cluster platform. However, traditional multiplexing VMMs do not scale beyond one single physical host, and it alone cannot provide the programming interface and cluster-wide computation that a datacenter system requires. We design a new instruction set architecture, DISA, to unify myriads of compute nodes to form a big virtual machine called DVM, and present programmers the view of a single computer where thousands of tasks run concurrently in a large, unified, and snapshotted memory space. The DVM provides a simple yet scalable programming model and mitigates the scalability bottleneck of traditional distributed shared memory systems. Along with an efficient execution engine, the capacity of a DVM can scale up to support large clusters. We have implemented and tested DVM on three platforms, and our evaluation shows that DVM has excellent performance in terms of execution time and speedup. On one physical host, the system overhead of DVM is comparable to that of traditional VMMs. On 16 physical hosts, the DVM runs 10 times faster than MapReduce/Hadoop and X10. On 256 EC2 instances, DVM shows linear speedup on a parallelizable workload.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yu:2012:SCO, author = "Tingting Yu and Witawas Srisa-an and Gregg Rothermel", title = "{SimTester}: a controllable and observable testing framework for embedded systems", journal = j-SIGPLAN, volume = "47", number = "7", pages = "51--62", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151034", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "In software for embedded systems, the frequent use of interrupts for timing, sensing, and I/O processing can cause concurrency faults to occur due to interactions between applications, device drivers, and interrupt handlers. 
This type of fault is considered by many practitioners to be among the most difficult to detect, isolate, and correct, in part because it can be sensitive to execution interleavings and often occurs without leaving any observable incorrect output. As such, commonly used testing techniques that inspect program outputs to detect failures are often ineffective at detecting them. To test for these concurrency faults, test engineers need to be able to control interleavings so that they are deterministic. Furthermore, they also need to be able to observe faults as they occur instead of relying on observable incorrect outputs. In this paper, we introduce SimTester, a framework that allows engineers to effectively test for subtle and non-deterministic concurrency faults by providing them with greater controllability and observability. We implemented our framework on a commercial virtual platform that is widely used to support hardware/software co-designs to promote ease of adoption. We then evaluated its effectiveness by using it to test for data races and deadlocks. The results show that our framework can be effective and efficient at detecting these faults.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhang:2012:SRB, author = "Yuan Zhang and Min Yang and Bo Zhou and Zhemin Yang and Weihua Zhang and Binyu Zang", title = "{Swift}: a register-based {JIT} compiler for embedded {JVMs}", journal = j-SIGPLAN, volume = "47", number = "7", pages = "63--74", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151035", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Code quality and compilation speed are two challenges for JIT compilers, and selective compilation is commonly used to trade off these two issues. Meanwhile, with more and more Java applications running on mobile devices, selective compilation encounters many problems. Since these applications typically have flat execution profiles and short lifetimes, a lightweight JIT technique that does not sacrifice code quality is sorely needed. However, the overhead of compiling stack-based Java bytecode to heterogeneous register-based machine code is significant on embedded devices. This paper presents a fast and effective JIT technique for mobile devices, building on a register-based Java bytecode format which is more similar to the underlying machine architecture. Through a comprehensive study of the characteristics of Java applications, we observe that the virtual registers used by more than 90\% of Java methods can be directly fulfilled by 11 physical registers. Based on this observation, this paper proposes Swift, a novel JIT compiler on register-based bytecode, which generates native code for RISC machines. After mapping virtual registers to physical registers, the code is generated efficiently by looking up a translation table, and the code quality is guaranteed by the static compiler that generates the register-based bytecode. In addition, we design two lightweight optimizations and an efficient code unloader to make Swift more suitable for embedded environments. Given the prevalence of Android, a prototype of Swift is implemented upon DEX bytecode, which is the official distribution format of Android applications.
Swift is evaluated with three benchmarks (SPECjvm98, EmbeddedCaffeineMark3 and JemBench2) on two different ARM SOCs: S3C6410 (armv6) and OMAP3530 (armv7). The results show that Swift achieves a speedup of 3.13 over the best-performing interpreter on the selected benchmarks. Compared with the state-of-the-art JIT compiler in Android, JITC-Droid, Swift achieves a speedup of 1.42.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shan:2012:FIA, author = "Zhiyong Shan and Xin Wang and Tzi-cker Chiueh and Xiaofeng Meng", title = "Facilitating inter-application interactions for {OS}-level virtualization", journal = j-SIGPLAN, volume = "47", number = "7", pages = "75--86", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151036", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "OS-level virtualization generates a minimal start-up and run-time overhead on the host OS and thus suits applications that require both good isolation and high efficiency. However, multiple-member applications required for forming a system may need to occasionally communicate across this isolation barrier to cooperate with each other while they are separated in different VMs to isolate intrusion or fault. Such application scenarios are often critical to enterprise-class servers, HPC clusters and intrusion/fault-tolerant systems, etc. We make the first effort to support the inter-application interactions in an OS-level virtualization system without causing a significant compromise on VM isolation. We identify all interactive operations that impact inter-application interactions, including inter-process communications, application invocations, resource name transfers and application dependencies. We propose Shuttle, a novel approach for facilitating inter-application interactions within and across OS-level virtual machines. Our results demonstrate that Shuttle can correctly address all necessary inter-application interactions while providing good isolation capability to all sample applications on different versions of Windows OS.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gerofi:2012:ETT, author = "Balazs Gerofi and Yutaka Ishikawa", title = "Enhancing {TCP} throughput of highly available virtual machines via speculative communication", journal = j-SIGPLAN, volume = "47", number = "7", pages = "87--96", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151038", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Checkpoint-recovery based virtual machine (VM) replication is an attractive technique for accommodating VM installations with high-availability. 
It provides seamless failover for the entire software stack executed in the VM regardless of the application or the underlying operating system (OS), it runs on commodity hardware, and it is inherently capable of dealing with shared memory non-determinism of symmetric multiprocessing (SMP) configurations. There have been several studies aiming at alleviating the overhead of replication; however, due to consistency requirements, the network performance of the basic replication mechanism remains extremely poor. In this paper we revisit the replication protocol and extend it with speculative communication. Speculative communication silently acknowledges TCP packets of the VM, enabling the guest's TCP stack to progress with transmission without exposing the messages to the clients before the corresponding execution state is checkpointed to the backup host. Furthermore, we propose replication aware congestion control, an extension to the guest's TCP stack that aggressively fills up the VMM's replication buffer so that speculative packets can be backed up and released earlier to the clients. We observe up to an order of magnitude improvement in bulk data transfer with speculative communication, and close to native VM network performance when replication awareness is enabled in the guest OS. We provide results of micro-, as well as application-level benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Rajagopalan:2012:SDT, author = "Shriram Rajagopalan and Brendan Cully and Ryan O'Connor and Andrew Warfield", title = "{SecondSite}: disaster tolerance as a service", journal = j-SIGPLAN, volume = "47", number = "7", pages = "97--108", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151039", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "This paper describes the design and implementation of SecondSite, a cloud-based service for disaster tolerance. SecondSite extends the Remus virtualization-based high availability system by allowing groups of virtual machines to be replicated across data centers over wide-area Internet links. The goal of the system is to commodify the property of availability, exposing it as a simple tick box when configuring a new virtual machine.
To achieve this in the wide area, we have had to tackle the related issues of replication traffic bandwidth, reliable failure detection across geographic regions and traffic redirection over a wide-area network without compromising on transparency and consistency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pan:2012:CLM, author = "Zhenhao Pan and Yaozu Dong and Yu Chen and Lei Zhang and Zhijiao Zhang", title = "{CompSC}: live migration with pass-through devices", journal = j-SIGPLAN, volume = "47", number = "7", pages = "109--120", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151040", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Live migration is one of the most important features of virtualization technology. With regard to recent virtualization techniques, performance of network I/O is critical. Current network I/O virtualization (e.g. Para-virtualized I/O, VMDq) has a significant performance gap with native network I/O. Pass-through network devices have near native performance, however, they have thus far prevented live migration. No existing methods solve the problem of live migration with pass-through devices perfectly. In this paper, we propose CompSC: a solution of hardware state migration that will enable the live migration support of pass-through devices. We go on to apply CompSC to SR-IOV network interface controllers. We discuss the attributes of different hardware states in pass-through devices and migrate them with corresponding techniques. Our experiments show that CompSC enables live migration on an Intel 82599 VF with a throughput 282.66\% higher than para-virtualized devices. In addition, service downtime during live migration is 42.9\% less than para-virtualized devices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kemerlis:2012:LPD, author = "Vasileios P. Kemerlis and Georgios Portokalidis and Kangkook Jee and Angelos D. Keromytis", title = "{{\tt libdft}}: practical dynamic data flow tracking for commodity systems", journal = j-SIGPLAN, volume = "47", number = "7", pages = "121--132", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151042", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Dynamic data flow tracking (DFT) deals with tagging and tracking data of interest as they propagate during program execution. DFT has been repeatedly implemented by a variety of tools for numerous purposes, including protection from zero-day and cross-site scripting attacks, detection and prevention of information leaks, and for the analysis of legitimate and malicious software. We present {\tt libdft}, a dynamic DFT framework that unlike previous work is at once fast, reusable, and works with commodity software and hardware. 
{\tt libdft} provides an API for building DFT-enabled tools that work on unmodified binaries, running on common operating systems and hardware, thus facilitating research and rapid prototyping. We explore different approaches for implementing the low-level aspects of instruction-level data tracking, introduce a more efficient and 64-bit capable shadow memory, and identify (and avoid) the common pitfalls responsible for the excessive performance overhead of previous studies. We evaluate {\tt libdft} using real applications with large codebases like the Apache and MySQL servers, and the Firefox web browser. We also use a series of benchmarks and utilities to compare {\tt libdft} with similar systems. Our results indicate that it performs at least as fast, if not faster, than previous solutions, and to the best of our knowledge, we are the first to evaluate the performance overhead of a fast dynamic DFT implementation in such depth. Finally, {\tt libdft} is freely available as open source software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bruening:2012:TDI, author = "Derek Bruening and Qin Zhao and Saman Amarasinghe", title = "Transparent dynamic instrumentation", journal = j-SIGPLAN, volume = "47", number = "7", pages = "133--144", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151043", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Process virtualization provides a virtual execution environment within which an unmodified application can be monitored and controlled while it executes. The provided layer of control can be used for purposes ranging from sandboxing to compatibility to profiling. The additional operations required for this layer are performed clandestinely alongside regular program execution. Software dynamic instrumentation is one method for implementing process virtualization which dynamically instruments an application such that the application's code and the inserted code are interleaved together. DynamoRIO is a process virtualization system implemented using software code cache techniques that allows users to build customized dynamic instrumentation tools. There are many challenges to building such a runtime system. One major obstacle is transparency. In order to support executing arbitrary applications, DynamoRIO must be fully transparent so that an application cannot distinguish between running inside the virtual environment and native execution. In addition, any desired extra operations for a particular tool must avoid interfering with the behavior of the application. Transparency has historically been provided on an ad-hoc basis, as a reaction to observed problems in target applications. This paper identifies a necessary set of transparency requirements for running mainstream Windows and Linux applications. We discuss possible solutions to each transparency issue, evaluate tradeoffs between different choices, and identify cases where maintaining transparency is not practically solvable. 
We believe this will provide a guideline for better design and implementation of transparent dynamic instrumentation, as well as other similar process virtualization systems using software code caches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lefebvre:2012:EM, author = "Geoffrey Lefebvre and Brendan Cully and Christopher Head and Mark Spear and Norm Hutchinson and Mike Feeley and Andrew Warfield", title = "Execution mining", journal = j-SIGPLAN, volume = "47", number = "7", pages = "145--158", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151044", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Operating systems represent large pieces of complex software that are carefully tested and broadly deployed. Despite this, developers frequently have little more than their source code to understand how they behave. This static representation of a system results in limited insight into execution dynamics, such as what code is important, how data flows through a system, or how threads interact with one another. We describe Tralfamadore, a system that preserves complete traces of machine execution as an artifact that can be queried and analyzed with a library of simple, reusable operators, making it easy to develop and run new dynamic analyses. We demonstrate the benefits of this approach with several example applications, including a novel unified source and execution browser.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pavlou:2012:DBD, author = "Demos Pavlou and Enric Gibert and Fernando Latorre and Antonio Gonzalez", title = "{DDGacc}: boosting dynamic {DDG}-based binary optimizations through specialized hardware support", journal = j-SIGPLAN, volume = "47", number = "7", pages = "159--168", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151046", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Dynamic Binary Translators (DBT) and Dynamic Binary Optimization (DBO) by software are used widely for several reasons including performance, design simplification and virtualization. However, the software layer in such systems introduces non-negligible overheads which affect performance and user experience. Hence, reducing DBT/DBO overheads is of paramount importance. In addition, reduced overheads have interesting collateral effects in the rest of the software layer, such as allowing optimizations to be applied earlier. A cost-effective solution to this problem is to provide hardware support to speed up the primitives of the software layer, paying special attention to automate DBT/DBO mechanisms and leave the heuristics to the software, which is more flexible. In this work, we have characterized the overheads of a DBO system using DynamoRIO implementing several basic optimizations. We have seen that the computation of the Data Dependence Graph (DDG) accounts for 5\%-10\% of the execution time. 
For this reason, we propose to add hardware support for this task in the form of a new functional unit, called DDGacc, which is integrated in a conventional pipeline processor and is operated through new ISA instructions. Our evaluation shows that DDGacc reduces the cost of computing the DDG by 32x, which reduces overall execution time by 5\%-10\% on average and up to 18\% for applications where the DBO optimizes large code footprints.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ishizaki:2012:ADT, author = "Kazuaki Ishizaki and Takeshi Ogasawara and Jose Castanos and Priya Nagpurkar and David Edelsohn and Toshio Nakatani", title = "Adding dynamically-typed language support to a statically-typed language compiler: performance evaluation, analysis, and tradeoffs", journal = j-SIGPLAN, volume = "47", number = "7", pages = "169--180", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151047", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Applications written in dynamically typed scripting languages are increasingly popular for Web software development. Even on the server side, programmers are using dynamically typed scripting languages such as Ruby and Python to build complex applications quickly. As the number and complexity of dynamically typed scripting language applications grows, optimizing their performance is becoming important. Some of the best performing compilers and optimizers for dynamically typed scripting languages are developed entirely from scratch and target a specific language. This approach is not scalable, given the variety of dynamically typed scripting languages, and the effort involved in developing and maintaining separate infrastructures for each. In this paper, we evaluate the feasibility of adapting and extending an existing production-quality method-based Just-In-Time (JIT) compiler for a language with dynamic types. Our goal is to identify the challenges and shortcomings with the current infrastructure, and to propose and evaluate runtime techniques and optimizations that can be incorporated into a common optimization infrastructure for static and dynamic languages. We discuss three extensions to the compiler to support dynamically typed languages: (1) simplification of control flow graphs, (2) mapping of memory locations to stack-allocated variables, and (3) reduction of runtime overhead using language semantics. We also propose four new optimizations for Python in (2) and (3). These extensions are effective in reduction of compiler working memory and improvement of runtime performance. We present a detailed performance evaluation of our approach for Python, finding an overall improvement of 1.69x on average (up to 2.74x) over our JIT compiler without any optimization for dynamically typed languages and Python.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lin:2012:UKT, author = "Yi Lin and Stephen M. 
Blackburn and Daniel Frampton", title = "Unpicking the knot: teasing apart {VM}\slash application interdependencies", journal = j-SIGPLAN, volume = "47", number = "7", pages = "181--190", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151048", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Flexible and efficient runtime design requires an understanding of the dependencies among the components internal to the runtime and those between the application and the runtime. These dependencies are frequently unclear. This problem exists in all runtime design, and is most vivid in a metacircular runtime --- one that is implemented in terms of itself. Metacircularity blurs boundaries between application and runtime implementation, making it harder to understand and make guarantees about overall system behavior, affecting isolation, security, and resource management, as well as reducing opportunities for optimization. Our goal is to shed new light on VM interdependencies, helping all VM designers understand these dependencies and thereby engineer better runtimes. We explore these issues in the context of a high-performance Java-in-Java virtual machine. Our approach is to identify and instrument transition points into and within the runtime, which allows us to establish a dynamic execution context. Our contributions are: (1) implementing and measuring a system that dynamically maintains execution context with very low overhead, (2) demonstrating that such a framework can be used to improve the software engineering of an existing runtime, and (3) analyzing the behavior and runtime characteristics of our runtime across a wide range of benchmarks. Our solution provides clarity about execution state and allowable transitions, making it easier to develop, debug, and understand managed runtimes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tuch:2012:BSV, author = "Harvey Tuch and Cyprien Laplace and Kenneth C. Barr and Bi Wu", title = "Block storage virtualization with commodity secure digital cards", journal = j-SIGPLAN, volume = "47", number = "7", pages = "191--202", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151050", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Smartphones, tablets and other mobile platforms typically accommodate bulk data storage with low-cost, FAT-formatted Secure Digital cards. When one uses a mobile device to run a full-system virtual machine (VM), there can be a mismatch between (1) the VM's I/O mixture, security and reliability requirements and (2) the properties of the storage media available for VM block storage and checkpoint images. To resolve this mismatch, this paper presents a new VM disk image format called the Logging Block Store (LBS). After motivating the need for a new format, LBS is described in detail with experimental results demonstrating its efficacy. 
As a result of this work, recommendations are made for future optimizations throughout the stack that may simplify and improve the performance of storage virtualization systems on mobile platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ghosh:2012:RAA, author = "Sudeep Ghosh and Jason Hiser and Jack W. Davidson", title = "Replacement attacks against {VM}-protected applications", journal = j-SIGPLAN, volume = "47", number = "7", pages = "203--214", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151051", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Process-level virtualization is increasingly being used to enhance the security of software applications from reverse engineering and unauthorized modification (called software protection). Process-level virtual machines (PVMs) can safeguard the application code at run time and hamper the adversary's ability to launch dynamic attacks on the application. This dynamic protection, combined with its flexibility, ease in handling legacy systems and low performance overhead, has made process-level virtualization a popular approach for providing software protection. While there has been much research on using process-level virtualization to provide such protection, there has been less research on attacks against PVM-protected software. In this paper, we describe an attack on applications protected using process-level virtualization, called a replacement attack. In a replacement attack, the adversary replaces the protecting PVM with an attack VM thereby rendering the application vulnerable to analysis and modification. We present a general description of the replacement attack methodology and two attack implementations against a protected application using freely available tools. The generality and simplicity of replacement attacks demonstrates that there is a strong need to develop techniques that meld applications more tightly to the protecting PVM to prevent such attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Payer:2012:PAA, author = "Mathias Payer and Thomas R. Gross", title = "Protecting applications against {TOCTTOU} races by user-space caching of file metadata", journal = j-SIGPLAN, volume = "47", number = "7", pages = "215--226", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151052", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "Time Of Check To Time Of Use (TOCTTOU) race conditions for file accesses in user-space applications are a common problem in Unix-like systems. The mapping between filename and inode and device is volatile and can provide the necessary preconditions for an exploit. Applications use filenames as the primary attribute to identify files but the mapping between filenames and inode and device can be changed by an attacker. DynaRace is an approach that protects unmodified applications from file-based TOCTTOU race conditions. 
DynaRace uses a transparent mapping cache that keeps additional state and metadata for each accessed file in the application. The combination of file state and the current system call type is used to decide whether (i) the metadata is updated or (ii) the correctness of the metadata is enforced between consecutive system calls. DynaRace uses user-mode path resolution internally to resolve individual file atoms. Each file atom is verified or updated according to the associated state in the mapping cache. More specifically, DynaRace protects against race conditions for all file-based system calls by replacing the unsafe system calls with a set of safe system calls that utilize the mapping cache. The system call is executed only if the state transition is allowed and the information in the mapping cache matches. DynaRace deterministically solves the problem of file-based race conditions for unmodified applications and removes an attacker's ability to exploit the TOCTTOU race condition. DynaRace detects injected alternate inode and device pairs and terminates the application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yan:2012:VCH, author = "Lok-Kwong Yan and Manjukumar Jayachandra and Mu Zhang and Heng Yin", title = "{V2E}: combining hardware virtualization and software emulation for transparent and extensible malware analysis", journal = j-SIGPLAN, volume = "47", number = "7", pages = "227--238", month = jul, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2365864.2151053", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Sep 6 10:01:03 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "VEE '12 conference proceedings.", abstract = "A transparent and extensible malware analysis platform is essential for defeating malware. This platform should be transparent so malware cannot easily detect and bypass it. It should also be extensible to provide strong support for heavyweight instrumentation and analysis efficiency. However, no existing platform can meet both requirements. Leveraging hardware virtualization technology, analysis platforms like Ether can achieve good transparency, but their instrumentation support and analysis efficiency are poor. In contrast, software emulation provides strong support for code instrumentation and good analysis efficiency by using dynamic binary translation. However, analysis platforms based on software emulation can be easily detected by malware and thus are poor in transparency. To achieve both transparency and extensibility, we propose a new analysis platform that combines hardware virtualization and software emulation. The essence is precise heterogeneous replay: the malware execution is recorded via hardware virtualization and then replayed in software. Our design ensures the execution replay is precise. Moreover, with page-level recording granularity, the platform can easily adjust to analyze various forms of malware (a process, a kernel module, or a shared library).
We implemented a prototype called V2E and demonstrated its capability and efficiency by conducting an extensive evaluation with both synthetic samples and 14 real-world emulation-resistant malware samples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Huynh:2012:SFM, author = "Huynh Phung Huynh and Andrei Hagiescu and Weng-Fai Wong and Rick Siow Mong Goh", title = "Scalable framework for mapping streaming applications onto multi-{GPU} systems", journal = j-SIGPLAN, volume = "47", number = "8", pages = "1--10", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145818", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Graphics processing units leverage a large array of parallel processing cores to boost the performance of a specific streaming computation pattern frequently found in graphics applications. Unfortunately, while many other general-purpose applications do exhibit the required streaming behavior, they also possess unfavorable data layouts and poor computation-to-communication ratios that penalize any straightforward execution on the GPU. In this paper we describe an efficient and scalable code generation framework that can map general-purpose streaming applications onto a multi-GPU system. This framework spans the entire core and memory hierarchy exposed by the multi-GPU system. Several key features in our framework ensure the scalability required by complex streaming applications. First, we propose an efficient stream graph partitioning algorithm that partitions the complex application to achieve the best performance under a given shared memory constraint. Next, the resulting partitions are mapped to multiple GPUs using an efficient architecture-driven strategy. The mapping balances the workload while considering the communication overhead. Finally, a highly effective pipelined execution scheme is employed to execute the partitions on the multi-GPU system. The framework has been implemented as a back-end of the StreamIt programming language compiler. Our comprehensive experiments show its scalability and significant performance speedup compared with a previous state-of-the-art solution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sim:2012:PAF, author = "Jaewoong Sim and Aniruddha Dasgupta and Hyesoon Kim and Richard Vuduc", title = "A performance analysis framework for identifying potential benefits in {GPGPU} applications", journal = j-SIGPLAN, volume = "47", number = "8", pages = "11--22", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145819", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Tuning code for GPGPU and other emerging many-core platforms is a challenge because few models or tools can precisely pinpoint the root cause of performance bottlenecks. In this paper, we present a performance analysis framework that can help shed light on such bottlenecks for GPGPU applications.
Although a handful of GPGPU profiling tools exist, most of the traditional tools, unfortunately, simply provide programmers with a variety of measurements and metrics obtained by running applications, and it is often difficult to map these metrics to understand the root causes of slowdowns, much less decide what next optimization step to take to alleviate the bottleneck. In our approach, we first develop an analytical performance model that can precisely predict performance and aims to provide programmer-interpretable metrics. Then, we apply static and dynamic profiling to instantiate our performance model for a particular input code and show how the model can predict the potential performance benefits. We demonstrate our framework on a suite of micro-benchmarks as well as a variety of computations extracted from real codes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Baghsorkhi:2012:EPE, author = "Sara S. Baghsorkhi and Isaac Gelado and Matthieu Delahaye and Wen-mei W. Hwu", title = "Efficient performance evaluation of memory hierarchy for highly multithreaded graphics processors", journal = j-SIGPLAN, volume = "47", number = "8", pages = "23--34", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145820", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "With the emergence of highly multithreaded architectures, performance monitoring techniques face new challenges in efficiently locating sources of performance discrepancies in the program source code. For example, the state-of-the-art performance counters in highly multithreaded graphics processing units (GPUs) report only the overall occurrences of microarchitecture events at the end of program execution. Furthermore, even if supported, any fine-grained sampling of performance counters will distort the actual program behavior and will make the sampled values inaccurate. On the other hand, it is difficult to achieve high resolution performance information at low sampling rates in the presence of thousands of concurrently running threads. In this paper, we present a novel software-based approach for monitoring the memory hierarchy performance in highly multithreaded general-purpose graphics processors. The proposed analysis is based on memory traces collected for snapshots of an application execution. A trace-based memory hierarchy model with a Monte Carlo experimental methodology generates statistical bounds of performance measures without being concerned about the exact inter-thread ordering of individual events but rather studying the behavior of the overall system. The statistical approach overcomes the classical problem of disturbed execution timing due to fine-grained instrumentation. The approach scales well as we deploy an efficient parallel trace collection technique to reduce the trace generation overhead and a simple memory hierarchy model to reduce the simulation time. The proposed scheme also keeps track of individual memory operations in the source code and can quantify their efficiency with respect to the memory system. A cross-validation of our results shows close agreement with the values read from the hardware performance counters on an NVIDIA Tesla C2050 GPU. 
Based on the high resolution profile data produced by our model we optimized memory accesses in the sparse matrix vector multiply kernel and achieved speedups ranging from 2.4 to 14.8 depending on the characteristics of the input matrices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ballard:2012:CAS, author = "Grey Ballard and James Demmel and Nicholas Knight", title = "Communication avoiding successive band reduction", journal = j-SIGPLAN, volume = "47", number = "8", pages = "35--44", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145822", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "The running time of an algorithm depends on both arithmetic and communication (i.e., data movement) costs, and the relative costs of communication are growing over time. In this work, we present both theoretical and practical results for tridiagonalizing a symmetric band matrix: we present an algorithm that asymptotically reduces communication, and we show that it indeed performs well in practice. The tridiagonalization of a symmetric band matrix is a key kernel in solving the symmetric eigenvalue problem for both full and band matrices. In order to preserve sparsity, tridiagonalization routines use annihilate-and-chase procedures that previously have suffered from poor data locality. We improve data locality by reorganizing the computation, asymptotically reducing communication costs compared to existing algorithms. Our sequential implementation demonstrates that avoiding communication improves runtime even at the expense of extra arithmetic: we observe a 2x speedup over Intel MKL while doing 43\% more floating point operations. Our parallel implementation targets shared-memory multicore platforms. It uses pipelined parallelism and a static scheduler while retaining the locality properties of the sequential algorithm. Due to lightweight synchronization and effective data reuse, we see 9.5x scaling over our serial code and up to 6x speedup over the PLASMA library, comparing parallel performance on a ten-core processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sack:2012:FTA, author = "Paul Sack and William Gropp", title = "Faster topology-aware collective algorithms through non-minimal communication", journal = j-SIGPLAN, volume = "47", number = "8", pages = "45--54", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145823", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Known algorithms for two important collective communication operations, allgather and reduce-scatter, are minimal-communication algorithms; no process sends or receives more than the minimum amount of data. This, combined with the data-ordering semantics of the operations, limits the flexibility and performance of these algorithms. 
Our novel non-minimal, topology-aware algorithms deliver far better performance with the addition of a very small amount of redundant communication. We develop novel algorithms for Clos networks and single or multi-ported torus networks. Tests on a 32k-node BlueGene/P result in allgather speedups of up to 6x and reduce-scatter speedups of over 11x compared to the native IBM algorithm. Broadcast, reduce, and allreduce can be composed of allgather or reduce-scatter and other collective operations; our techniques also improve the performance of these algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kim:2012:ESC, author = "Seonggun Kim and Hwansoo Han", title = "Efficient {SIMD} code generation for irregular kernels", journal = j-SIGPLAN, volume = "47", number = "8", pages = "55--64", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145824", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Array indirection causes several challenges for compilers to utilize single instruction, multiple data (SIMD) instructions. Disjoint memory references, arbitrarily misaligned memory references, and dependence cycles in loops are main challenges to handle for SIMD compilers. Due to those challenges, existing SIMD compilers have excluded loops with array indirection from their candidate loops for SIMD vectorization. However, addressing those challenges is inevitable, since many important compute-intensive applications extensively use array indirection to reduce memory and computation requirements. In this work, we propose a method to generate efficient SIMD code for loops containing indirected memory references. We extract both inter- and intra-iteration parallelism, taking data reorganization overhead into consideration. We also optimally place data reorganization code in order to amortize the reorganization overhead through the performance gain of SIMD vectorization. Experiments on four array indirection kernels, which are extracted from real-world scientific applications, show that our proposed method effectively generates SIMD code for irregular kernels with array indirection. Compared to the existing SIMD vectorization methods, our proposed method significantly improves the performance of irregular kernels by 91\%, on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Leissa:2012:ECL, author = "Roland Lei{\ss}a and Sebastian Hack and Ingo Wald", title = "Extending a {C}-like language for portable {SIMD} programming", journal = j-SIGPLAN, volume = "47", number = "8", pages = "65--74", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145825", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "SIMD instructions are common in CPUs for years now. Using these instructions effectively requires not only vectorization of code, but also modifications to the data layout. 
However, automatic vectorization techniques are often not powerful enough and suffer from restricted scope of applicability; hence, programmers often vectorize their programs manually by using intrinsics: compiler-known functions that directly expand to machine instructions. They significantly decrease programmer productivity by enforcing a very error-prone and hard-to-read assembly-like programming style. Furthermore, intrinsics are not portable because they are tied to a specific instruction set. In this paper, we show how a C-like language can be extended to allow for portable and efficient SIMD programming. Our extension puts the programmer in total control over where and how control-flow vectorization is triggered. We present a type system and a formal semantics of our extension and prove the soundness of the type system. Using our prototype implementation IVL that targets Intel's MIC architecture and SSE instruction set, we show that the generated code is roughly on par with handwritten intrinsic code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kwon:2012:HAO, author = "Okwan Kwon and Fahed Jubair and Rudolf Eigenmann and Samuel Midkiff", title = "A hybrid approach of {OpenMP} for clusters", journal = j-SIGPLAN, volume = "47", number = "8", pages = "75--84", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145827", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "We present the first fully automated compiler-runtime system that successfully translates and executes OpenMP shared-address-space programs on laboratory-size clusters, for the complete set of regular, repetitive applications in the NAS Parallel Benchmarks. We introduce a hybrid compiler-runtime translation scheme. Compared to previous work, this scheme features a new runtime data flow analysis and new compiler techniques for improving data affinity and reducing communication costs. We present and discuss the performance of our translated programs, and compare them with the performance of the MPI, HPF and UPC versions of the benchmarks. The results show that our translated programs achieve 75\% of the hand-coded MPI programs, on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{hunEom:2012:DDP, author = "Yong hun Eom and Stephen Yang and James C. Jenista and Brian Demsky", title = "{DOJ}: dynamically parallelizing object-oriented programs", journal = j-SIGPLAN, volume = "47", number = "8", pages = "85--96", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145828", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "We present Dynamic Out-of-Order Java (DOJ), a dynamic parallelization approach. In DOJ, a developer annotates code blocks as tasks to decouple these blocks from the parent execution thread. 
The DOJ compiler then analyzes the code to generate heap examiners that ensure the parallel execution preserves the behavior of the original sequential program. Heap examiners dynamically extract heap dependences between code blocks and determine when it is safe to execute a code block. We have implemented DOJ and evaluated it on twelve benchmarks. We achieved an average compilation speedup of 31.15 times over OoOJava and an average execution speedup of 12.73 times over sequential versions of the benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bonetta:2012:SLH, author = "Daniele Bonetta and Achille Peternier and Cesare Pautasso and Walter Binder", title = "{S}: a scripting language for high-performance {RESTful} {Web} services", journal = j-SIGPLAN, volume = "47", number = "8", pages = "97--106", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145829", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "There is an urgent need for novel programming abstractions to leverage the parallelism in modern multicore machines. We introduce S, a new domain-specific language targeting the server-side scripting of high-performance RESTful Web services. S promotes an innovative programming model based on explicit (control-flow) and implicit (process-level) parallelism control, allowing the service developer to specify which portions of the control-flow should be executed in parallel. For each service, the choice of the best level of parallelism is left to the runtime system. We assess performance and scalability by implementing two non-trivial composite Web services in S. Experiments show that S-based Web services can handle thousands of concurrent client requests on a modern multicore machine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mendez-Lojo:2012:GII, author = "Mario Mendez-Lojo and Martin Burtscher and Keshav Pingali", title = "A {GPU} implementation of inclusion-based points-to analysis", journal = j-SIGPLAN, volume = "47", number = "8", pages = "107--116", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145831", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Graphics Processing Units (GPUs) have emerged as powerful accelerators for many regular algorithms that operate on dense arrays and matrices. In contrast, we know relatively little about using GPUs to accelerate highly irregular algorithms that operate on pointer-based data structures such as graphs. For the most part, research has focused on GPU implementations of graph analysis algorithms that do not modify the structure of the graph, such as algorithms for breadth-first search and strongly-connected components. In this paper, we describe a high-performance GPU implementation of an important graph algorithm used in compilers such as gcc and LLVM: Andersen-style inclusion-based points-to analysis. 
This algorithm is challenging to parallelize effectively on GPUs because it makes extensive modifications to the structure of the underlying graph and performs relatively little computation. In spite of this, our program, when executed on a 14 Streaming Multiprocessor GPU, achieves an average speedup of 7x compared to a sequential CPU implementation and outperforms a parallel implementation of the same algorithm running on 16 CPU cores. Our implementation provides general insights into how to produce high-performance GPU implementations of graph algorithms, and it highlights key differences between optimizing parallel programs for multicore CPUs and for GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Merrill:2012:SGG, author = "Duane Merrill and Michael Garland and Andrew Grimshaw", title = "Scalable {GPU} graph traversal", journal = j-SIGPLAN, volume = "47", number = "8", pages = "117--128", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145832", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Breadth-first search (BFS) is a core primitive for graph traversal and a basis for many higher-level graph analysis algorithms. It is also representative of a class of parallel computations whose memory accesses and work distribution are both irregular and data-dependent. Recent work has demonstrated the plausibility of GPU sparse graph traversal, but has tended to focus on asymptotically inefficient algorithms that perform poorly on graphs with non-trivial diameter. We present a BFS parallelization focused on fine-grained task management constructed from efficient prefix sum that achieves an asymptotically optimal O(|V| + |E|) work complexity. Our implementation delivers excellent performance on diverse graphs, achieving traversal rates in excess of 3.3 billion and 8.3 billion traversed edges per second using single and quad-GPU configurations, respectively. This level of performance is several times faster than state-of-the-art implementations on both CPU and GPU platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zu:2012:GBN, author = "Yuan Zu and Ming Yang and Zhonghu Xu and Lin Wang and Xin Tian and Kunyang Peng and Qunfeng Dong", title = "{GPU}-based {NFA} implementation for memory efficient high speed regular expression matching", journal = j-SIGPLAN, volume = "47", number = "8", pages = "129--140", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145833", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Regular expression pattern matching is the foundation and core engine of many network functions, such as network intrusion detection, worm detection, traffic analysis, web applications and so on. DFA-based solutions suffer exponentially exploding state space and cannot be remedied without sacrificing matching speed.
Given this scalability problem of DFA-based methods, there has been increasing interest in NFA-based methods for memory efficient regular expression matching. Achieving high matching speed using NFAs requires potentially massive parallel processing, and hence represents an ideal programming task on a Graphics Processing Unit (GPU). Based on in-depth understanding of NFA properties as well as GPU architecture, we propose effective methods for fitting NFAs into GPU architecture through proper data structure and parallel programming design, so that GPU's parallel processing power can be better utilized to achieve high speed regular expression matching. Experimental results demonstrate that, compared with the existing GPU-based NFA implementation method [9], our proposed methods can boost matching speed by 29 to 46 times, consistently yielding above 10 Gbps matching speed on an NVIDIA GTX-460 GPU. Meanwhile, our design only needs a small amount of memory space, growing exponentially more slowly than DFA size. These results make our design an effective solution for memory efficient high speed regular expression matching, and clearly demonstrate the power and potential of the GPU as a platform for memory efficient high speed regular expression matching.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kogan:2012:MCF, author = "Alex Kogan and Erez Petrank", title = "A methodology for creating fast wait-free data structures", journal = j-SIGPLAN, volume = "47", number = "8", pages = "141--150", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145835", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Lock-freedom is a progress guarantee that ensures overall program progress. Wait-freedom is a stronger progress guarantee that ensures the progress of each thread in the program. While many practical lock-free algorithms exist, wait-free algorithms are typically inefficient and hardly used in practice. In this paper, we propose a methodology called fast-path-slow-path for creating efficient wait-free algorithms. The idea is to execute the efficient lock-free version most of the time and revert to the wait-free version only when things go wrong. The generality and effectiveness of this methodology are demonstrated by two examples. In this paper, we apply this idea to a recent construction of a wait-free queue, bringing the wait-free implementation to perform in practice as efficiently as the lock-free implementation.
In another work, the fast-path-slow-path methodology has been used for (dramatically) improving the performance of a wait-free linked-list.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Prokopec:2012:CTE, author = "Aleksandar Prokopec and Nathan Grasso Bronson and Phil Bagwell and Martin Odersky", title = "Concurrent tries with efficient non-blocking snapshots", journal = j-SIGPLAN, volume = "47", number = "8", pages = "151--160", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145836", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "We describe a non-blocking concurrent hash trie based on shared-memory single-word compare-and-swap instructions. The hash trie supports standard mutable lock-free operations such as insertion, removal, lookup and their conditional variants. To ensure space-efficiency, removal operations compress the trie when necessary. We show how to implement an efficient lock-free snapshot operation for concurrent hash tries. The snapshot operation uses a single-word compare-and-swap and avoids copying the data structure eagerly. Snapshots are used to implement consistent iterators and a linearizable size retrieval. We compare concurrent hash trie performance with other concurrent data structures and evaluate the performance of the snapshot operation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Crain:2012:SFB, author = "Tyler Crain and Vincent Gramoli and Michel Raynal", title = "A speculation-friendly binary search tree", journal = j-SIGPLAN, volume = "47", number = "8", pages = "161--170", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145837", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "We introduce the first binary search tree algorithm designed for speculative executions. Prior to this work, tree structures were mainly designed for their pessimistic (non-speculative) accesses to have a bounded complexity. Researchers tried to evaluate transactional memory using such tree structures whose prominent example is the red-black tree library developed by Oracle Labs that is part of multiple benchmark distributions. Although well-engineered, such structures remain badly suited for speculative accesses, whose step complexity might rise dramatically with contention. We show that our speculation-friendly tree outperforms the existing transaction-based version of the AVL and the red-black trees. Its key novelty stems from the decoupling of update operations: they are split into one transaction that modifies the abstraction state and multiple ones that restructure its tree implementation in the background.
In particular, the speculation-friendly tree is shown correct, reusable and it speeds up a transaction-based travel reservation application by up to 3.5x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2012:PUA, author = "Yifeng Chen and Xiang Cui and Hong Mei", title = "{PARRAY}: a unifying array representation for heterogeneous parallelism", journal = j-SIGPLAN, volume = "47", number = "8", pages = "171--180", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145838", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "This paper introduces a programming interface called PARRAY (or Parallelizing ARRAYs) that supports system-level succinct programming for heterogeneous parallel systems like GPU clusters. The current practice of software development requires combining several low-level libraries like Pthread, OpenMP, CUDA and MPI. Achieving productivity and portability is hard with different numbers and models of GPUs. PARRAY extends mainstream C programming with novel array types of distinct features: (1) the dimensions of an array type are nested in a tree, conceptually reflecting the memory hierarchy; (2) the definition of an array type may contain references to other array types, allowing sophisticated array types to be created for parallelization; (3) threads also form arrays that allow programming in a Single-Program-Multiple-Codeblock (SPMC) style to unify various sophisticated communication patterns. This leads to shorter, more portable and maintainable parallel codes, while the programmer still has control over performance-related features necessary for deep manual optimization. Although the source-to-source code generator only faithfully generates low-level library calls according to the type information, higher-level programming and automatic performance optimization are still possible through building libraries of sub-programs on top of PARRAY. The case study on cluster FFT illustrates a simple 30-line code that 2x outperforms Intel Cluster MKL on the Tianhe-1A system with 7168 Fermi GPUs and 14336 CPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Blelloch:2012:IDP, author = "Guy E. Blelloch and Jeremy T. Fineman and Phillip B. Gibbons and Julian Shun", title = "Internally deterministic parallel algorithms can be fast", journal = j-SIGPLAN, volume = "47", number = "8", pages = "181--192", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145840", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "The virtues of deterministic parallelism have been argued for decades and many forms of deterministic parallelism have been described and analyzed. Here we are concerned with one of the strongest forms, requiring that for any input there is a unique dependence graph representing a trace of the computation annotated with every operation and value. 
This has been referred to as internal determinism, and implies a sequential semantics--- i.e., considering any sequential traversal of the dependence graph is sufficient for analyzing the correctness of the code. In addition to returning deterministic results, internal determinism has many advantages including ease of reasoning about the code, ease of verifying correctness, ease of debugging, ease of defining invariants, ease of defining good coverage for testing, and ease of formally, informally and experimentally reasoning about performance. On the other hand one needs to consider the possible downsides of determinism, which might include making algorithms (i) more complicated, unnatural or special purpose and/or (ii) slower or less scalable. In this paper we study the effectiveness of this strong form of determinism through a broad set of benchmark problems. Our main contribution is to demonstrate that for this wide body of problems, there exist efficient internally deterministic algorithms, and moreover that these algorithms are natural to reason about and not complicated to code. We leverage an approach to determinism suggested by Steele (1990), which is to use nested parallelism with commutative operations. Our algorithms apply several diverse programming paradigms that fit within the model including (i) a strict functional style (no shared state among concurrent operations), (ii) an approach we refer to as deterministic reservations, and (iii) the use of commutative, linearizable operations on data structures. We describe algorithms for the benchmark problems that use these deterministic approaches and present performance results on a 32-core machine. Perhaps surprisingly, for all problems, our internally deterministic algorithms achieve good speedup and good performance even relative to prior nondeterministic solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Leiserson:2012:DPR, author = "Charles E. Leiserson and Tao B. Schardl and Jim Sukha", title = "Deterministic parallel random-number generation for dynamic-multithreading platforms", journal = j-SIGPLAN, volume = "47", number = "8", pages = "193--204", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145841", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Existing concurrency platforms for dynamic multithreading do not provide repeatable parallel random-number generators. This paper proposes that a mechanism called pedigrees be built into the runtime system to enable efficient deterministic parallel random-number generation. Experiments with the open-source MIT Cilk runtime system show that the overhead for maintaining pedigrees is negligible. Specifically, on a suite of 10 benchmarks, the relative overhead of Cilk with pedigrees to the original Cilk has a geometric mean of less than 1\%. We persuaded Intel to modify its commercial C/C++ compiler, which provides the Cilk Plus concurrency platform, to include pedigrees, and we built a library implementation of a deterministic parallel random-number generator called DotMix that compresses the pedigree and then ``RC6-mixes'' the result. 
The statistical quality of DotMix is comparable to that of the popular Mersenne twister, but somewhat slower than a nondeterministic parallel version of this efficient and high-quality serial random-number generator. The cost of calling DotMix depends on the ``spawn depth'' of the invocation. For a naive Fibonacci calculation with n=40 that calls DotMix in every node of the computation, this ``price of determinism'' is a factor of 2.65 in running time, but for more realistic applications with less intense use of random numbers --- such as a maximal-independent-set algorithm, a practical samplesort program, and a Monte Carlo discrete-hedging application from QuantLib --- the observed ``price'' was less than 5\%. Moreover, even if overheads were several times greater, applications using DotMix should be amply fast for debugging purposes, which is a major reason for desiring repeatability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nobari:2012:SPM, author = "Sadegh Nobari and Thanh-Tung Cao and Panagiotis Karras and St{\'e}phane Bressan", title = "Scalable parallel minimum spanning forest computation", journal = j-SIGPLAN, volume = "47", number = "8", pages = "205--214", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145842", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "The proliferation of data in graph form calls for the development of scalable graph algorithms that exploit parallel processing environments. One such problem is the computation of a graph's minimum spanning forest (MSF). Past research has proposed several parallel algorithms for this problem, yet none of them scales to large, high-density graphs. In this paper we propose a novel, scalable, parallel MSF algorithm for undirected weighted graphs. Our algorithm leverages Prim's algorithm in a parallel fashion, concurrently expanding several subsets of the computed MSF. Our effort focuses on minimizing the communication among different processors without constraining the local growth of a processor's computed subtree. In effect, we achieve a scalability that previous approaches lacked. We implement our algorithm in CUDA, running on a GPU and study its performance using real and synthetic, sparse as well as dense, structured and unstructured graph data. Our experimental study demonstrates that our algorithm outperforms the previous state-of-the-art GPU-based MSF algorithm, while being several orders of magnitude faster than sequential CPU-based algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2012:GCV, author = "Guodong Li and Peng Li and Geof Sawaya and Ganesh Gopalakrishnan and Indradeep Ghosh and Sreeranga P. 
Rajan", title = "{GKLEE}: concolic verification and test generation for {GPUs}", journal = j-SIGPLAN, volume = "47", number = "8", pages = "215--224", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145844", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Programs written for GPUs often contain correctness errors such as races and deadlocks, or may compute the wrong result. Existing debugging tools often miss these errors because of their limited input-space and execution-space exploration. Existing tools based on conservative static analysis or conservative modeling of SIMD concurrency generate false alarms resulting in wasted bug-hunting. They also often do not target performance bugs (non-coalesced memory accesses, memory bank conflicts, and divergent warps). We provide a new framework called GKLEE that can analyze C++ GPU programs, locating the aforesaid correctness and performance bugs. For these programs, GKLEE can also automatically generate tests that provide high coverage. These tests serve as concrete witnesses for every reported bug. They can also be used for downstream debugging, for example to test the kernel on the actual hardware. We describe the architecture of GKLEE, its symbolic virtual machine model, and describe previously unknown bugs and performance issues that it detected on commercial SDK kernels. We describe GKLEE's test-case reduction heuristics, and the resulting scalability improvement for a given coverage target.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Du:2012:ABF, author = "Peng Du and Aurelien Bouteiller and George Bosilca and Thomas Herault and Jack Dongarra", title = "Algorithm-based fault tolerance for dense matrix factorizations", journal = j-SIGPLAN, volume = "47", number = "8", pages = "225--234", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145845", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Dense matrix factorizations, such as LU, Cholesky and QR, are widely used for scientific applications that require solving systems of linear equations, eigenvalue problems and linear least squares problems. Such computations are normally carried out on supercomputers, whose ever-growing scale induces a fast decline of the Mean Time To Failure (MTTF). This paper proposes a new hybrid approach, based on Algorithm-Based Fault Tolerance (ABFT), to help matrix factorization algorithms survive fail-stop failures. We consider extreme conditions, such as the absence of any reliable component and the possibility of losing both data and checksum from a single failure. We will present a generic solution for protecting the right factor, where the updates are applied, of all the above-mentioned factorizations. For the left factor, where the panel has been applied, we propose a scalable checkpointing algorithm. This algorithm features a high degree of checkpointing parallelism and cooperatively utilizes the checksum storage leftover from the right factor protection.
The fault-tolerant algorithms derived from this hybrid solution are applicable to a wide range of dense matrix factorizations, with minor modifications. Theoretical analysis shows that the fault tolerance overhead sharply decreases with the scaling in the number of computing units and the problem size. Experimental results of LU and QR factorization on the Kraken (Cray XT5) supercomputer validate the theoretical evaluation and confirm negligible overhead, with and without errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Buhler:2012:EDA, author = "Jeremy D. Buhler and Kunal Agrawal and Peng Li and Roger D. Chamberlain", title = "Efficient deadlock avoidance for streaming computation with filtering", journal = j-SIGPLAN, volume = "47", number = "8", pages = "235--246", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145846", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Parallel streaming computations have been studied extensively, and many languages, libraries, and systems have been designed to support this model of computation. In particular, we consider acyclic streaming computations in which individual nodes can choose to filter, or discard, some of their inputs in a data-dependent manner. In these applications, if the channels between nodes have finite buffers, the computation can deadlock. One method of deadlock avoidance is to augment the data streams between nodes with occasional dummy messages; however, for general DAG topologies, no polynomial time algorithm is known to compute the intervals at which dummy messages must be sent to avoid deadlock. In this paper, we show that deadlock avoidance for streaming computations with filtering can be performed efficiently for a large class of DAG topologies. We first present a new method where each dummy message is tagged with a destination, so as to reduce the number of dummy messages sent over the network. We then give efficient algorithms for dummy interval computation in series-parallel DAGs. We finally generalize our results to a larger graph family, which we call the CS4 DAGs, in which every undirected Cycle is Single-Source and Single-Sink (CS$^4$). Our results show that, for a large set of application topologies that are both intuitively useful and formalizable, the streaming model with filtering can be implemented safely with reasonable overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dice:2012:LCG, author = "David Dice and Virendra J.
Marathe and Nir Shavit", title = "Lock cohorting: a general technique for designing {NUMA} locks", journal = j-SIGPLAN, volume = "47", number = "8", pages = "247--256", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145848", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Multicore machines are quickly shifting to NUMA and CC-NUMA architectures, making scalable NUMA-aware locking algorithms, ones that take into account the machines' non-uniform memory and caching hierarchy, ever more important. This paper presents lock cohorting, a general new technique for designing NUMA-aware locks that is as simple as it is powerful. Lock cohorting allows one to transform any spin-lock algorithm, with minimal non-intrusive changes, into scalable NUMA-aware spin-locks. Our new cohorting technique allows us to easily create NUMA-aware versions of the TATAS-Backoff, CLH, MCS, and ticket locks, to name a few. Moreover, it allows us to derive a CLH-based cohort abortable lock, the first NUMA-aware queue lock to support abortability. We empirically compared the performance of cohort locks with prior NUMA-aware and classic NUMA-oblivious locks on a synthetic micro-benchmark, a real world key-value store application memcached, as well as the libc memory allocator. Our results demonstrate that cohort locks perform as well or better than known locks when the load is low and significantly out-perform them as the load increases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fatourou:2012:RCS, author = "Panagiota Fatourou and Nikolaos D. Kallimanis", title = "Revisiting the combining synchronization technique", journal = j-SIGPLAN, volume = "47", number = "8", pages = "257--266", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145849", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Fine-grain thread synchronization has been proved, in several cases, to be outperformed by efficient implementations of the combining technique where a single thread, called the combiner, holding a coarse-grain lock, serves, in addition to its own synchronization request, active requests announced by other threads while they are waiting by performing some form of spinning. Efficient implementations of this technique significantly reduce the cost of synchronization, so in many cases they exhibit much better performance than the most efficient finely synchronized algorithms. In this paper, we revisit the combining technique with the goal to discover where its real performance power resides and whether or how ensuring some desired properties (e.g., fairness in serving requests) would impact performance. We do so by presenting two new implementations of this technique; the first (CC-Synch) addresses systems that support coherent caches, whereas the second (DSM-Synch) works better in cacheless NUMA machines. 
In comparison to previous such implementations, the new implementations (1) provide bounds on the number of remote memory references (RMRs) that they perform, (2) support a stronger notion of fairness, and (3) use simpler and less basic primitives than previous approaches. In all our experiments, the new implementations outperform by far all previous state-of-the-art combining-based and fine-grain synchronization algorithms. Our experimental analysis sheds light on the questions we aimed to answer. Several modern multi-core systems organize the cores into clusters and provide fast communication within the same cluster and much slower communication across clusters. We present a hierarchical version of CC-Synch, called H-Synch, which exploits the hierarchical communication nature of such systems to achieve better performance. Experiments show that H-Synch significantly outperforms previous state-of-the-art hierarchical approaches. We provide new implementations of common shared data structures (like stacks and queues) based on CC-Synch, DSM-Synch and H-Synch. Our experiments show that these implementations outperform by far all previous (fine-grain or combining-based) implementations of shared stacks and queues.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tardieu:2012:WSS, author = "Olivier Tardieu and Haichuan Wang and Haibo Lin", title = "A work-stealing scheduler for {X10}'s task parallelism with suspension", journal = j-SIGPLAN, volume = "47", number = "8", pages = "267--276", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145850", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "The X10 programming language is intended to ease the programming of scalable concurrent and distributed applications. X10 augments a familiar imperative object-oriented programming model with constructs to support light-weight asynchronous tasks as well as execution across multiple address spaces. A crucial aspect of X10's runtime system is the scheduling of concurrent tasks. Work-stealing schedulers have been shown to efficiently load balance fine-grain divide-and-conquer task-parallel programs on SMPs and multicores. But X10 is not limited to shared-memory fork-join parallelism. X10 permits tasks to suspend and synchronize by means of conditional atomic blocks and remote task invocations. In this paper, we demonstrate that work-stealing scheduling principles are applicable to a rich programming language such as X10, achieving performance at scale without compromising expressivity, ease of use, or portability. We design and implement a portable work-stealing execution engine for X10. While this engine is biased toward the efficient execution of fork-join parallelism in shared memory, it handles the full X10 language, especially conditional atomic blocks and distribution. We show that this engine improves the run time of a series of benchmark programs by several orders of magnitude when used in combination with the C++ backend compiler and runtime for X10.
It achieves scaling comparable to state-of-the-art work-stealing scheduler implementations---the Cilk++ compiler and the Java fork/join framework---despite the dramatic increase in generality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Baskaran:2012:ACO, author = "Muthu Manikandan Baskaran and Nicolas Vasilache and Benoit Meister and Richard Lethin", title = "Automatic communication optimizations through memory reuse strategies", journal = j-SIGPLAN, volume = "47", number = "8", pages = "277--278", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145852", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Modern parallel architectures are emerging with sophisticated hardware consisting of hierarchically placed parallel processors and memories. The properties of memories in a system vary wildly, not only quantitatively (size, latency, bandwidth, number of banks) but also qualitatively (scratchpad, cache). Along with the emergence of such architectures comes the need for effectively utilizing the parallel processors and properly managing data movement across memories to improve memory bandwidth and hide data transfer latency. In this paper, we describe some of the high-level optimizations that are targeted at the improvement of memory performance in the R-Stream compiler, a high-level source-to-source automatic parallelizing compiler. We direct our focus in this paper on optimizing communications (data transfers) by improving memory reuse at various levels of an explicit memory hierarchy. This general concept is well-suited to the hardware properties of GPGPUs, which is the architecture that we concentrate on for this paper. We apply our techniques and obtain performance improvement on various stencil kernels including an important iterative stencil kernel in seismic processing applications where the performance is comparable to that of the state-of-the-art implementation of the kernel by a CUDA expert.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liu:2012:FPA, author = "Gu Liu and Hong An and Wenting Han and Xiaoqiang Li and Tao Sun and Wei Zhou and Xuechao Wei and Xulong Tang", title = "{FlexBFS}: a parallelism-aware implementation of breadth-first search on {GPU}", journal = j-SIGPLAN, volume = "47", number = "8", pages = "279--280", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145853", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "In this paper, we present FlexBFS, a parallelism-aware implementation for breadth-first search on GPU. Our implementation can adjust the computation resources according to the feedback of available parallelism dynamically. We also optimized our program in three ways: (1) a simplified two-level queue management, (2) a combined kernel strategy and (3) a high-degree vertices specialization approach.
Our experimental results show that it can achieve 3 to 20 times speedup against the fastest serial version, and can outperform the TBB based multi-threading CPU version and the previous most effective GPU version on all types of input graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Andersch:2012:PPE, author = "Michael Andersch and Chi Ching Chi and Ben Juurlink", title = "Programming parallel embedded and consumer applications in {OpenMP} superscalar", journal = j-SIGPLAN, volume = "47", number = "8", pages = "281--282", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145854", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "In this paper, we evaluate the performance and usability of the parallel programming model OpenMP Superscalar (OmpSs), apply it to 10 different benchmarks and compare its performance with corresponding POSIX threads implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhong:2012:OMS, author = "Jianlong Zhong and Bingsheng He", title = "An overview of {Medusa}: simplified graph processing on {GPUs}", journal = j-SIGPLAN, volume = "47", number = "8", pages = "283--284", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145855", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Graphs are the de facto data structures for many applications, and efficient graph processing is a must for the application performance. GPUs have an order of magnitude higher computational power and memory bandwidth compared to CPUs and have been adopted to accelerate several common graph algorithms. However, it is difficult to write correct and efficient GPU programs and even more difficult for graph processing due to the irregularities of graph structures. To address those difficulties, we propose a programming framework named Medusa to simplify graph processing on GPUs. Medusa offers a small set of APIs, based on which developers can define their application logics by writing sequential code without awareness of GPU architectures. The Medusa runtime system automatically executes the developer defined APIs in parallel on the GPU, with a series of graph-centric optimizations. 
This poster gives an overview of Medusa, and presents some preliminary results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Alias:2012:ORA, author = "Christophe Alias and Alain Darte and Alexandru Plesco", title = "Optimizing remote accesses for offloaded kernels: application to high-level synthesis for {FPGA}", journal = j-SIGPLAN, volume = "47", number = "8", pages = "285--286", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145856", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "In the context of the high-level synthesis (HLS) of regular kernels offloaded to FPGA and communicating with an external DDR memory, we show how to automatically generate adequate communicating processes for optimizing the transfer of remote data. This requires a generalized form of communication coalescing where data can be transferred from the external memory even when this memory is not fully up-to-date. Experiments with Altera HLS tools demonstrate that this automatization, based on advanced polyhedral code analysis and code generation techniques, can be used to efficiently map C kernels to FPGA, by generating, entirely at C level, all the necessary glue (the communication processes), which is compiled with the same HLS tool as for the computation kernel.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tao:2012:UGA, author = "Jian Tao and Marek Blazewicz and Steven R. Brandt", title = "Using {GPU}'s to accelerate stencil-based computation kernels for the development of large scale scientific applications on heterogeneous systems", journal = j-SIGPLAN, volume = "47", number = "8", pages = "287--288", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145857", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "We present CaCUDA --- a GPGPU kernel abstraction and a parallel programming framework for developing highly efficient large scale scientific applications using stencil computations on hybrid CPU/GPU architectures. CaCUDA is built upon the Cactus computational toolkit, an open source problem solving environment designed for scientists and engineers. Due to the flexibility and extensibility of the Cactus toolkit, the addition of a GPGPU programming framework required no changes to the Cactus infrastructure, guaranteeing that existing features and modules will continue to work without modification. 
CaCUDA was tested and benchmarked using a 3D CFD code based on a finite difference discretization of Navier--Stokes equations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Marker:2012:MED, author = "Bryan Marker and Andy Terrel and Jack Poulson and Don Batory and Robert van de Geijn", title = "Mechanizing the expert dense linear algebra developer", journal = j-SIGPLAN, volume = "47", number = "8", pages = "289--290", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145858", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "The efforts of an expert to parallelize and optimize a dense linear algebra algorithm for distributed-memory targets are largely mechanical and repetitive. We demonstrate that these efforts can be encoded and automatically applied to obviate the manual implementation of many algorithms in high-performance code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nugteren:2012:BHM, author = "Cedric Nugteren and Henk Corporaal", title = "The boat hull model: adapting the roofline model to enable performance prediction for parallel computing", journal = j-SIGPLAN, volume = "47", number = "8", pages = "291--292", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145859", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Multi-core and many-core were already major trends for the past six years, and are expected to continue for the next decades. With these trends of parallel computing, it becomes increasingly difficult to decide on which architecture to run a given application. In this work, we use an algorithm classification to predict performance prior to algorithm implementation. For this purpose, we modify the roofline model to include class information. In this way, we enable architectural choice through performance prediction prior to the development of architecture specific code. The new model, the boat hull model, is demonstrated using a GPU as a target architecture. We show for 6 example algorithms that performance is predicted accurately without requiring code to be available.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Feng:2012:SPG, author = "Min Feng and Rajiv Gupta and Laxmi N. 
Bhuyan", title = "Speculative parallelization on {GPGPUs}", journal = j-SIGPLAN, volume = "47", number = "8", pages = "293--294", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145860", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "This paper overviews the first speculative parallelization technique for GPUs that can exploit parallelism in loops even in the presence of dynamic irregularities that may give rise to cross-iteration dependences. The execution of a speculatively parallelized loop consists of five phases: scheduling, computation, misspeculation check, result committing, and misspeculation recovery. We perform the misspeculation check on the GPU to minimize its cost. We optimize the procedures of result committing and misspeculation recovery to reduce the result copying and recovery overhead. Finally, the scheduling policies are designed according to the types of cross-iteration dependences to reduce the misspeculation rate. Our preliminary evaluation was conducted on an nVidia Tesla C1060 hosted in an Intel(R) Xeon(R) E5540 machine. We use three benchmarks, of which two contain irregular memory accesses and one contains irregular control flows that can give rise to cross-iteration dependences. Our implementation achieves 3.6x--13.8x speedups for loops in these benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jimborean:2012:APM, author = "Alexandra Jimborean and Philippe Clauss and Beno{\^\i}t Pradelle and Luis Mastrangelo and Vincent Loechner", title = "Adapting the polyhedral model as a framework for efficient speculative parallelization", journal = j-SIGPLAN, volume = "47", number = "8", pages = "295--296", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145861", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "In this paper, we present a Thread-Level Speculation (TLS) framework whose main feature is to be able to speculatively parallelize a sequential loop nest in various ways, by re-scheduling its iterations. The transformation to be applied is selected at runtime with the goal of minimizing the number of rollbacks and maximizing performance. We perform code transformations by applying the polyhedral model that we adapted for speculative and runtime code parallelization. For this purpose, we design a parallel code pattern which is patched by our runtime system according to the profiling information collected on some execution samples. Adaptability is ensured by considering chunks of code of various sizes that are launched successively, each of which is parallelized in a different manner, or run sequentially, depending on the currently observed behavior for accessing memory.
We show on several benchmarks that our framework yields good performance on codes which could not be handled efficiently by previously proposed TLS systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gong:2012:OCN, author = "Yifan Gong and Bingsheng He and Jianlong Zhong", title = "An overview of {CMPI}: network performance aware {MPI} in the cloud", journal = j-SIGPLAN, volume = "47", number = "8", pages = "297--298", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145862", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Cloud computing enables users to perform distributed computing tasks on many virtual machines, without owning a physical cluster. Recently, various distributed computing tasks such as scientific applications are being moved from supercomputers and private clusters to public clouds. Message passing interface (MPI) is a key and common component in distributed computing tasks. The virtualized computing environment of the public cloud hides the network topology information from the users, and existing topology-aware optimizations for MPI are no longer feasible in the cloud environment. We propose a network performance aware MPI library named CMPI. CMPI embraces a new model for capturing the network performance among different virtual machines in the cloud. Based on the network performance model, we develop novel network performance aware algorithms for communication operations. This poster gives an overview of CMPI design, and presents some preliminary results on collective operations such as broadcast. We demonstrate the effectiveness of our network performance aware optimizations on Amazon EC2.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kim:2012:OUP, author = "Jungwon Kim and Sangmin Seo and Jun Lee and Jeongho Nah and Gangwon Jo and Jaejin Lee", title = "{OpenCL} as a unified programming model for heterogeneous {CPU\slash GPU} clusters", journal = j-SIGPLAN, volume = "47", number = "8", pages = "299--300", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145863", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "In this paper, we propose an OpenCL framework for heterogeneous CPU/GPU clusters, and show that the framework achieves both high performance and ease of programming. The framework provides an illusion of a single system for the user. It allows the application to utilize multiple heterogeneous compute devices, such as multicore CPUs and GPUs, in a remote node as if they were in a local node. No communication API, such as the MPI library, is required in the application source.
We implement the OpenCL framework and evaluate its performance on a heterogeneous CPU/GPU cluster that consists of one host node and nine compute nodes using eleven OpenCL benchmark applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tzenakis:2012:BBL, author = "George Tzenakis and Angelos Papatriantafyllou and John Kesapides and Polyvios Pratikakis and Hans Vandierendonck and Dimitrios S. Nikolopoulos", title = "{BDDT}: block-level dynamic dependence analysis for deterministic task-based parallelism", journal = j-SIGPLAN, volume = "47", number = "8", pages = "301--302", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145864", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kamil:2012:PPP, author = "Shoaib Kamil and Derrick Coetzee and Scott Beamer and Henry Cook and Ekaterina Gonina and Jonathan Harper and Jeffrey Morlan and Armando Fox", title = "Portable parallel performance from sequential, productive, embedded domain-specific languages", journal = j-SIGPLAN, volume = "47", number = "8", pages = "303--304", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145865", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Domain-expert productivity programmers desire scalable application performance, but usually must rely on efficiency programmers who are experts in explicit parallel programming to achieve it. Since such programmers are rare, to maximize reuse of their work we propose encapsulating their strategies in mini-compilers for domain-specific embedded languages (DSELs) glued together by a common high-level host language familiar to productivity programmers. The nontrivial applications that use these DSELs perform up to 98\% of peak attainable performance, and comparable to or better than existing hand-coded implementations. Our approach is unique in that each mini-compiler not only performs conventional compiler transformations and optimizations, but includes imperative procedural code that captures an efficiency expert's strategy for mapping a narrow domain onto a specific type of hardware. The result is source- and performance-portability for productivity programmers and parallel performance that rivals that of hand-coded efficiency-language implementations of the same applications. We describe a framework that supports our methodology and five implemented DSELs supporting common computation kernels. 
Our results demonstrate that for several interesting classes of problems, efficiency-level parallel performance can be achieved by packaging efficiency programmers' expertise in a reusable framework that is easy to use for both productivity programmers and efficiency programmers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hoefler:2012:CCO, author = "Torsten Hoefler and Timo Schneider", title = "Communication-centric optimizations by dynamically detecting collective operations", journal = j-SIGPLAN, volume = "47", number = "8", pages = "305--306", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145866", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "The steady increase of parallelism in high-performance computing platforms implies that communication will be most important in large-scale applications. In this work, we tackle the problem of transparent optimization of large-scale communication patterns using online compilation techniques. We utilize the Group Operation Assembly Language (GOAL), an abstract parallel dataflow definition language, to specify our transformations in a device-independent manner. We develop fast schemes that analyze dataflow and synchronization semantics in GOAL and detect if parts of the (or the whole) communication pattern express a known collective communication operation. The detection of collective operations allows us to replace the detected patterns with highly optimized algorithms or low-level hardware calls and thus improve performance significantly. Benchmark results suggest that our technique can lead to a performance improvement of orders of magnitude compared with various optimized algorithms written in Co-Array Fortran. Detecting collective operations also improves the programmability of parallel languages in that the user does not have to understand the detailed semantics of high-level communication operations in order to generate efficient and scalable code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhang:2012:LLF, author = "Donghui Zhang and Per-{\AA}ke Larson", title = "{LHlf}: lock-free linear hashing (poster paper)", journal = j-SIGPLAN, volume = "47", number = "8", pages = "307--308", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145868", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "LHlf is a new hash table designed to allow very high levels of concurrency. The table is lock free and grows and shrinks automatically according to the number of items in the table. Insertions, lookups and deletions are never blocked. LHlf is based on linear hashing but adopts recursive split-ordering of the items within a bucket to be able to split and merge lists in a lock free manner.
LHlf is as fast as the best previous lock-free design and in addition it offers stable performance, uses less space, and supports both expansions and contractions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Timnat:2012:WFL, author = "Shahar Timnat and Anastasia Braginsky and Alex Kogan and Erez Petrank", title = "Wait-free linked-lists", journal = j-SIGPLAN, volume = "47", number = "8", pages = "309--310", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145869", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "The linked-list data structure is fundamental and ubiquitous. Lock-free versions of the linked-list are well known. However, the existence of a practical wait-free linked-list has been open. In this work we designed such a linked-list. To achieve better performance, we have also extended this design using the fast-path-slow-path methodology. The resulting implementation achieves performance which is competitive with that of Harris's lock-free list, while still guaranteeing non-starvation via wait-freedom. We have also developed a proof for the correctness and the wait-freedom of our design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dinh:2012:SPD, author = "Minh Ngoc Dinh and David Abramson and Chao Jin and Andrew Gontarek and Bob Moench and Luiz DeRose", title = "Scalable parallel debugging with statistical assertions", journal = j-SIGPLAN, volume = "47", number = "8", pages = "311--312", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145870", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Traditional debuggers are of limited value for modern scientific codes that manipulate large complex data structures. This paper discusses a novel debug-time assertion, called a ``Statistical Assertion'', that allows a user to reason about large data structures, and the primitives are parallelised to provide an efficient solution. We present the design and implementation of statistical assertions, and illustrate the debugging technique with a molecular dynamics simulation. 
We evaluate the performance of the tool on a 12,000-core Cray XE6.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Malkis:2012:VSB, author = "Alexander Malkis and Anindya Banerjee", title = "Verification of software barriers", journal = j-SIGPLAN, volume = "47", number = "8", pages = "313--314", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145871", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "This paper describes frontiers in verification of the software barrier synchronization primitive. So far most software barrier algorithms have not been mechanically verified. We show preliminary results in automatically proving the correctness of the major software barriers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mittal:2012:CAS, author = "Anshul Mittal and Nikhil Jain and Thomas George and Yogish Sabharwal and Sameer Kumar", title = "Collective algorithms for sub-communicators", journal = j-SIGPLAN, volume = "47", number = "8", pages = "315--316", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145872", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Collective communication over a group of processors is an integral and time consuming component in many HPC applications. Many modern day supercomputers are based on torus interconnects. On such systems, for an irregular communicator comprising a subset of processors, the algorithms developed so far are not contention free in general and hence non-optimal. In this paper, we present a novel contention-free algorithm to perform collective operations over a subset of processors in a torus network. We also extend previous work on regular communicators to handle special cases of irregular communicators that occur frequently in parallel scientific applications. For the generic case where multiple node disjoint sub-communicators communicate simultaneously in a loosely synchronous fashion, we propose a novel cooperative approach to route the data for individual sub-communicators without contention.
Empirical results demonstrate that our algorithms outperform the optimized MPI collective implementation on IBM's Blue Gene/P supercomputer for large data sizes and random node distributions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{DeKoster:2012:SVE, author = "Joeri {De Koster} and Stefan Marr and Theo D'Hondt", title = "Synchronization views for event-loop actors", journal = j-SIGPLAN, volume = "47", number = "8", pages = "317--318", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145873", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "The actor model has already proven itself as an interesting concurrency model that avoids issues such as deadlocks and race conditions by construction, and thus facilitates concurrent programming. The tradeoff is that it sacrifices expressiveness and efficiency especially with respect to data parallelism. However, many standard solutions to computationally expensive problems employ data parallel algorithms for better performance on parallel systems. We identified three problems that inhibit the use of data-parallel algorithms within the actor model. Firstly, one of the main properties of the actor model, the fact that no data is shared, is one of the most severe performance bottlenecks; in particular, shared state cannot be read truly in parallel. Secondly, the actor model on its own does not provide a mechanism to specify extra synchronization conditions on batches of messages, which leads to event-level data-races. And lastly, programmers are forced to write code in a continuation-passing style (CPS) to handle typical request-response situations. However, CPS breaks the sequential flow of the code and is often hard to understand, which increases complexity and lowers maintainability. We propose synchronization views to solve these three issues without compromising the semantic properties of the actor model. Thus, the resulting concurrency model maintains deadlock-freedom, avoids low-level race conditions, and keeps the semantics of macro-step execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Metreveli:2012:CCP, author = "Zviad Metreveli and Nickolai Zeldovich and M. Frans Kaashoek", title = "{CPHASH}: a cache-partitioned hash table", journal = j-SIGPLAN, volume = "47", number = "8", pages = "319--320", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145874", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "CPHash is a concurrent hash table for multicore processors. CPHash partitions its table across the caches of cores and uses message passing to transfer lookups\slash inserts to a partition. CPHash's message passing avoids the need for locks, pipelines batches of asynchronous messages, and packs multiple messages into a single cache line transfer.
Experiments on an 80-core machine with 2 hardware threads per core show that CPHash has $ \approx 1.6 \times $ higher throughput than a hash table implemented using fine-grained locks. An analysis shows that CPHash wins because it experiences fewer cache misses and its cache misses are less expensive, because of less contention for the on-chip interconnect and DRAM. CPServer, a key\slash value cache server using CPHash, achieves $ \approx 5 \% $ higher throughput than a key\slash value cache server that uses a hash table with fine-grained locks, but both achieve better throughput and scalability than memcached. The throughput of CPHash and CPServer also scales near-linearly with the number of cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wernsing:2012:RHA, author = "John R. Wernsing and Greg Stitt", title = "{RACECAR}: a heuristic for automatic function specialization on multi-core heterogeneous systems", journal = j-SIGPLAN, volume = "47", number = "8", pages = "321--322", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145875", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "High-performance computing systems increasingly combine multi-core processors and heterogeneous resources such as graphics-processing units and field-programmable gate arrays. However, significant application design complexity for such systems has often led to untapped performance potential. Application designers targeting such systems currently must determine how to parallelize computation, create device-specialized implementations for each heterogeneous resource, and determine how to partition work for each resource. In this paper, we present the RACECAR heuristic to automate the optimization of applications for multi-core heterogeneous systems by automatically exploring implementation alternatives that include different algorithms, parallelization strategies, and work distributions. Experimental results show RACECAR-specialized implementations achieve speedups up to 117x and average 11x compared to a single CPU thread when parallelizing computation across multiple cores, graphics-processing units, and field-programmable gate arrays.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liu:2012:LFA, author = "Yujie Liu and Michael Spear", title = "A lock-free, array-based priority queue", journal = j-SIGPLAN, volume = "47", number = "8", pages = "323--324", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145876", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Noll:2012:IDO, author = "Albert Noll and Thomas R.
Gross", title = "An infrastructure for dynamic optimization of parallel programs", journal = j-SIGPLAN, volume = "47", number = "8", pages = "325--326", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145877", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Object-oriented programming languages like Java provide only low-level constructs (e.g., starting a thread) to describe concurrency. High-level abstractions (e.g., thread pools) are merely provided as a library. As a result, a compiler is not aware of the high-level semantics of a parallel library and therefore misses important optimization opportunities. This paper presents a simple source language extension based on which a compiler is provided with the opportunity to perform new optimizations that are particularly effective for parallel code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kjolstad:2012:ADG, author = "Fredrik Kjolstad and Torsten Hoefler and Marc Snir", title = "Automatic datatype generation and optimization", journal = j-SIGPLAN, volume = "47", number = "8", pages = "327--328", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145878", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Many high performance applications spend considerable time packing noncontiguous data into contiguous communication buffers. MPI Datatypes provide an alternative by describing noncontiguous data layouts. This allows sophisticated hardware to retrieve data directly from application data structures. However, packing codes in real-world applications are often complex and specifying equivalent datatypes is difficult, time-consuming, and error prone. We present an algorithm that automates the transformation. We have implemented the algorithm in a tool that transforms packing code to MPI Datatypes, and evaluated it by transforming 90 packing codes from the NAS Parallel Benchmarks. 
The transformation allows easy porting of applications to new machines that benefit from datatypes, thus improving programmer productivity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Burnim:2012:NIN, author = "Jacob Burnim and Tayfun Elmas and George Necula and Koushik Sen", title = "{NDetermin}: inferring nondeterministic sequential specifications for parallelism correctness", journal = j-SIGPLAN, volume = "47", number = "8", pages = "329--330", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145879", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Nondeterministic Sequential (NDSeq) specifications have been proposed as a means for separating the testing, debugging, and verifying of a program's parallelism correctness and its sequential functional correctness. In this work, we present a technique that, given a few representative executions of a parallel program, combines dynamic data flow analysis and Minimum-Cost Boolean Satisfiability (MinCostSAT) solving for automatically inferring a likely NDSeq specification for the parallel program. For a number of Java benchmarks, our tool NDetermin infers equivalent or stronger NDSeq specifications than those previously written manually.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Park:2012:CB, author = "Chang-Seo Park and Koushik Sen", title = "Concurrent breakpoints", journal = j-SIGPLAN, volume = "47", number = "8", pages = "331--332", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145880", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "In program debugging, reproducibility of bugs is a key requirement. Unfortunately, bugs in concurrent programs are notoriously difficult to reproduce because bugs due to concurrency happen under very specific thread schedules and the likelihood of taking such corner-case schedules during regular testing is very low. We propose concurrent breakpoints, a light-weight and programmatic way to make a concurrency bug reproducible. We describe a mechanism that helps to hit a concurrent breakpoint in a concurrent execution with high probability. We have implemented concurrent breakpoints as a light-weight library for Java and C/C++ programs. 
We have used the implementation to deterministically reproduce several known non-deterministic bugs in real-world concurrent Java and C/C++ programs with almost 100\% probability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Stone:2012:EMP, author = "Andrew Stone and John Dennis and Michelle Strout", title = "Establishing a {Miniapp} as a programmability proxy", journal = j-SIGPLAN, volume = "47", number = "8", pages = "333--334", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145881", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "Miniapps serve as test beds for prototyping and evaluating new algorithms, data structures, and programming models before incorporating such changes into larger applications. For the miniapp to accurately predict how a prototyped change would affect a larger application it is necessary that the miniapp be shown to serve as a proxy for that larger application. Although many benchmarks claim to proxy the performance for a set of large applications, little work has explored what criteria must be met for a benchmark to serve as a proxy for examining programmability. In this poster we describe criteria that can be used to establish that a miniapp serves as a performance and programmability proxy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jiang:2012:OSP, author = "Lei Jiang and Pragneshkumar B. Patel and George Ostrouchov and Ferdinand Jamitzky", title = "{OpenMP}-style parallelism in data-centered multicore computing with {R}", journal = j-SIGPLAN, volume = "47", number = "8", pages = "335--336", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145882", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "R$^1$ is a domain specific language widely used for data analysis by the statistics community as well as by researchers in finance, biology, social sciences, and many other disciplines. As R programs are linked to input data, the exponential growth of available data makes high-performance computing with R imperative. To ease the process of writing parallel programs in R, code transformation from a sequential program to a parallel version would bring much convenience to R users. In this paper, we present our work in semi-automatic parallelization of R codes with user-added OpenMP-style pragmas. While such pragmas are used at the frontend, we take advantage of multiple parallel backends with different R packages. We provide flexibility for importing parallelism with plug-in components, impose built-in MapReduce for data processing, and also maintain code reusability. 
We illustrate the advantage of the on-the-fly mechanisms which can lead to significant applications in data-centered parallel computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Caniou:2012:PAP, author = "Yves Caniou and Daniel Diaz and Florian Richoux and Philippe Codognet and Salvador Abreu", title = "Performance analysis of parallel constraint-based local search", journal = j-SIGPLAN, volume = "47", number = "8", pages = "337--338", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145883", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "We present a parallel implementation of a constraint-based local search algorithm and investigate its performance results for hard combinatorial optimization problems on two different platforms up to several hundreds of cores. On a variety of classical CSPs benchmarks, speedups are very good for a few tens of cores, and good up to a hundred cores. More challenging problems derived from real-life applications (Costas array) shows even better speedups, nearly optimal up to 256 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Thiemann:2012:ACE, author = "Peter Thiemann", title = "{AGDA}-curious?: an exploration of programming with dependent types", journal = j-SIGPLAN, volume = "47", number = "9", pages = "1--2", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "I explore programming with the dependently typed functional language, AGDA. I present the progress which AGDA has made, demonstrate its usage in a small development, reflect critically on the state of the art, and speculate about the way ahead. I do not seek to persuade you to adopt AGDA as your primary tool for systems development, but argue that AGDA stimulates new useful ways to think about programming problems and deserves not just curiosity but interest, support and contribution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Stewart:2012:VHT, author = "Gordon Stewart and Lennart Beringer and Andrew W. Appel", title = "Verified heap theorem prover by paramodulation", journal = j-SIGPLAN, volume = "47", number = "9", pages = "3--14", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364531", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present VeriStar, a verified theorem prover for a decidable subset of separation logic. 
Together with VeriSmall [3], a proved-sound Smallfoot-style program analysis for C minor, VeriStar demonstrates that fully machine-checked static analyses equipped with efficient theorem provers are now within the reach of formal methods. As a pair, VeriStar and VeriSmall represent the first application of the Verified Software Toolchain [4], a tightly integrated collection of machine-verified program logics and compilers giving foundational correctness guarantees. VeriStar is (1) purely functional, (2) machine-checked, (3) end-to-end, (4) efficient and (5) modular. By purely functional, we mean it is implemented in Gallina, the pure functional programming language embedded in the Coq theorem prover. By machine-checked, we mean it has a proof in Coq that when the prover says ``valid'', the checked entailment holds in a proved-sound separation logic for C minor. By end-to-end, we mean that when the static analysis+theorem prover says a C minor program is safe, the program will be compiled to a semantically equivalent assembly program that runs on real hardware. By efficient, we mean that the prover implements a state-of-the-art algorithm for deciding heap entailments and uses highly tuned verified functional data structures. By modular, we mean that VeriStar can be retrofitted to other static analyses as a plug-compatible entailment checker and its soundness proof can easily be ported to other separation logics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Huffman:2012:FVM, author = "Brian Huffman", title = "Formal verification of monad transformers", journal = j-SIGPLAN, volume = "47", number = "9", pages = "15--16", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364532", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present techniques for reasoning about constructor classes that (like the monad class) fix polymorphic operations and assert polymorphic axioms. We do not require a logic with first-class type constructors, first-class polymorphism, or type quantification; instead, we rely on a domain-theoretic model of the type system in a universal domain to provide these features. These ideas are implemented in the Tycon library for the Isabelle theorem prover, which builds on the HOLCF library of domain theory. The Tycon library provides various axiomatic type constructor classes, including functors and monads. It also provides automation for instantiating those classes, and for defining further subclasses. We use the Tycon library to formalize three Haskell monad transformers: the error transformer, the writer transformer, and the resumption transformer. 
The error and writer transformers do not universally preserve the monad laws; however, we establish datatype invariants for each, showing that they are valid monads when viewed as abstract datatypes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Dunfield:2012:EIU, author = "Joshua Dunfield", title = "Elaborating intersection and union types", journal = j-SIGPLAN, volume = "47", number = "9", pages = "17--28", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Designing and implementing typed programming languages is hard. Every new type system feature requires extending the metatheory and implementation, which are often complicated and fragile. To ease this process, we would like to provide general mechanisms that subsume many different features. In modern type systems, parametric polymorphism is fundamental, but intersection polymorphism has gained little traction in programming languages. Most practical intersection type systems have supported only refinement intersections, which increase the expressiveness of types (more precise properties can be checked) without altering the expressiveness of terms; refinement intersections can simply be erased during compilation. In contrast, unrestricted intersections increase the expressiveness of terms, and can be used to encode diverse language features, promising an economy of both theory and implementation. We describe a foundation for compiling unrestricted intersection and union types: an elaboration type system that generates ordinary $ \lambda $-calculus terms. The key feature is a Forsythe-like merge construct. With this construct, not all reductions of the source program preserve types; however, we prove that ordinary call-by-value evaluation of the elaborated program corresponds to a type-preserving evaluation of the source program. We also describe a prototype implementation and applications of unrestricted intersections and unions: records, operator overloading, and simulating dynamic typing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Chen:2012:ETT, author = "Sheng Chen and Martin Erwig and Eric Walkingshaw", title = "An error-tolerant type system for variational lambda calculus", journal = j-SIGPLAN, volume = "47", number = "9", pages = "29--40", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364535", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Conditional compilation and software product line technologies make it possible to generate a huge number of different programs from a single software project. Typing each of these programs individually is usually impossible due to the sheer number of possible variants. 
Our previous work has addressed this problem with a type system for variational lambda calculus (VLC), an extension of lambda calculus with basic constructs for introducing and organizing variation. Although our type inference algorithm is more efficient than the brute-force strategy of inferring the types of each variant individually, it is less robust since type inference will fail for the entire variational expression if any one variant contains a type error. In this work, we extend our type system to operate on VLC expressions containing type errors. This extension directly supports locating ill-typed variants and the incremental development of variational programs. It also has many subtle implications for the unification of variational types. We show that our extended type system possesses a principal typing property and that the underlying unification problem is unitary. Our unification algorithm computes partial unifiers that lead to result types that (1) contain errors in as few variants as possible and (2) are most general. Finally, we perform an empirical evaluation to determine the overhead of this extension compared to our previous work, to demonstrate the improvements over the brute-force approach, and to explore the effects of various error distributions on the inference process.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Krishnaswami:2012:SST, author = "Neelakantan R. Krishnaswami and Aaron Turon and Derek Dreyer and Deepak Garg", title = "Superficially substructural types", journal = j-SIGPLAN, volume = "47", number = "9", pages = "41--54", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364536", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many substructural type systems have been proposed for controlling access to shared state in higher-order languages. Central to these systems is the notion of a *resource*, which may be split into disjoint pieces that different parts of a program can manipulate independently without worrying about interfering with one another. Some systems support a *logical* notion of resource (such as permissions), under which two resources may be considered disjoint even if they govern the *same* piece of state. However, in nearly all existing systems, the notions of resource and disjointness are fixed at the outset, baked into the model of the language, and fairly coarse-grained in the kinds of sharing they enable. In this paper, inspired by recent work on ``fictional disjointness'' in separation logic, we propose a simple and flexible way of enabling any module in a program to create its own custom type of splittable resource (represented as a commutative monoid), thus providing fine-grained control over how the module's private state is shared with its clients. 
This functionality can be incorporated into an otherwise standard substructural type system by means of a new typing rule we call *the sharing rule*, whose soundness we prove semantically via a novel resource-oriented Kripke logical relation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Mitchell:2012:SBB, author = "Neil Mitchell", title = "Shake before building: replacing {\tt make} with {Haskell}", journal = j-SIGPLAN, volume = "47", number = "9", pages = "55--66", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364538", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most complex software projects are compiled using a build tool (e.g. make), which runs commands in an order satisfying user-defined dependencies. Unfortunately, most build tools require all dependencies to be specified before the build starts. This restriction makes many dependency patterns difficult to express, especially those involving files generated at build time. We show how to eliminate this restriction, allowing additional dependencies to be specified while building. We have implemented our ideas in the Haskell library Shake, and have used Shake to write a complex build system which compiles millions of lines of code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Chitil:2012:PTL, author = "Olaf Chitil", title = "Practical typed lazy contracts", journal = j-SIGPLAN, volume = "47", number = "9", pages = "67--76", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364539", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Until now there has been no support for specifying and enforcing contracts within a lazy functional program. That is a shame, because contracts consist of pre- and post-conditions for functions that go beyond the standard static types. This paper presents the design and implementation of a small, easy-to-use, purely functional contract library for Haskell, which, when a contract is violated, also provides more useful information than the classical blaming of one contract partner. From now on lazy functional languages can profit from the assurances in the development of correct programs that contracts provide.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Oliveira:2012:FPS, author = "Bruno C.d.S. Oliveira and William R. 
Cook", title = "Functional programming with structured graphs", journal = j-SIGPLAN, volume = "47", number = "9", pages = "77--88", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364541", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a new functional programming model for graph structures called structured graphs. Structured graphs extend conventional algebraic datatypes with explicit definition and manipulation of cycles and/or sharing, and offer a practical and convenient way to program graphs in functional programming languages like Haskell. The representation of sharing and cycles (edges) employs recursive binders and uses an encoding inspired by parametric higher-order abstract syntax. Unlike traditional approaches based on mutable references or node/edge lists, well-formedness of the graph structure is ensured statically and reasoning can be done with standard functional programming techniques. Since the binding structure is generic, we can define many useful generic combinators for manipulating structured graphs. We give applications and show how to reason about structured graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Sheard:2012:PPC, author = "Timothy E. Sheard", title = "Painless programming combining reduction and search: design principles for embedding decision procedures in high-level languages", journal = j-SIGPLAN, volume = "47", number = "9", pages = "89--102", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364542", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe the Funlogic system which extends a functional language with existentially quantified declarations. An existential declaration introduces a variable and a set of constraints that its value should meet. Existential variables are bound to conforming values by a decision procedure. Funlogic embeds multiple external decision procedures using a common framework. Design principles for embedding decision procedures are developed and illustrated for three different decision procedures from widely varying domains.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Dagand:2012:TFA, author = "Pierre-Evariste Dagand and Conor McBride", title = "Transporting functions across ornaments", journal = j-SIGPLAN, volume = "47", number = "9", pages = "103--114", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364544", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming with dependent types is a blessing and a curse. It is a blessing to be able to bake invariants into the definition of datatypes: we can finally write correct-by-construction software. 
However, this extreme accuracy is also a curse: a datatype is the combination of a structuring medium together with a special purpose logic. These domain-specific logics hamper any effort of code reuse among similarly structured data. In this paper, we exorcise our datatypes by adapting the notion of ornament to our universe of inductive families. We then show how code reuse can be achieved by ornamenting functions. Using these functional ornaments, we capture the relationship between functions such as the addition of natural numbers and the concatenation of lists. With this knowledge, we demonstrate how the implementation of the former informs the implementation of the latter: the user can ask the definition of addition to be lifted to lists and she will only be asked the details necessary to carry on adding lists rather than numbers. Our presentation is formalised in a type theory with a universe of datatypes and all our constructions have been implemented as generic programs, requiring no extension to the type theory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Myreen:2012:PPS, author = "Magnus O. Myreen and Scott Owens", title = "Proof-producing synthesis of {ML} from higher-order logic", journal = j-SIGPLAN, volume = "47", number = "9", pages = "115--126", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364545", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The higher-order logic found in proof assistants such as Coq and various HOL systems provides a convenient setting for the development and verification of pure functional programs. However, to efficiently run these programs, they must be converted (or ``extracted'') to functional programs in a programming language such as ML or Haskell. With current techniques, this step, which must be trusted, relates similar looking objects that have very different semantic definitions, such as the set-theoretic model of a logic and the operational semantics of a programming language. In this paper, we show how to increase the trustworthiness of this step with an automated technique. Given a functional program expressed in higher-order logic, our technique provides the corresponding program for a functional language defined with an operational semantics, and it provides a mechanically checked theorem relating the two. This theorem can then be used to transfer verified properties of the logical function to the program. 
We have implemented our technique in the HOL4 theorem prover, translating functions to a core subset of Standard ML, and have applied it to examples including functional data structures, a parser generator, cryptographic algorithms, and a garbage collector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Danielsson:2012:OSU, author = "Nils Anders Danielsson", title = "Operational semantics using the partiality monad", journal = j-SIGPLAN, volume = "47", number = "9", pages = "127--138", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364546", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The operational semantics of a partial, functional language is often given as a relation rather than as a function. The latter approach is arguably more natural: if the language is functional, why not take advantage of this when defining the semantics? One can immediately see that a functional semantics is deterministic and, in a constructive setting, computable. This paper shows how one can use the coinductive partiality monad to define big-step or small-step operational semantics for lambda-calculi and virtual machines as total, computable functions (total definitional interpreters). To demonstrate that the resulting semantics are useful, type soundness and compiler correctness results are also proved. The results have been implemented and checked using Agda, a dependently typed programming language and proof assistant.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Olukotun:2012:HPE, author = "Kunle Olukotun", title = "High performance embedded domain specific languages", journal = j-SIGPLAN, volume = "47", number = "9", pages = "139--140", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364548", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today, all high-performance computer architectures are parallel and heterogeneous; a combination of multiple CPUs, GPUs and specialized processors. This creates a complex programming problem for application developers. Domain-specific languages (DSLs) are a promising solution to this problem because they provide an avenue for application-specific abstractions to be mapped directly to low level architecture-specific programming models providing high programmer productivity and high execution performance. In this talk I will describe our approach to building high performance DSLs, which is based on embedding in Scala, light-weight modular staging and a DSL infrastructure called Delite. I will describe how we transform impure functional programs into efficient first-order low-level code using domain specific optimization, parallelism optimization, locality optimization, scalar optimization, and architecture-specific code generation.
All optimizations and transformations are implemented in an extensible DSL compiler architecture that minimizes the programmer effort required to develop a new DSL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Severi:2012:PTS, author = "Paula G. Severi and Fer-Jan J. de Vries", title = "Pure type systems with corecursion on streams: from finite to infinitary normalisation", journal = j-SIGPLAN, volume = "47", number = "9", pages = "141--152", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364550", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we use types for ensuring that programs involving streams are well-behaved. We extend pure type systems with a type constructor for streams, a modal operator next and a fixed point operator for expressing corecursion. This extension is called Pure Type Systems with Corecursion (CoPTS). The typed lambda calculus for reactive programs defined by Krishnaswami and Benton can be obtained as a CoPTS. CoPTSs allow us to study a wide range of typed lambda calculi extended with corecursion using only one framework. In particular, we study this extension for the calculus of constructions which is the underlying formal language of Coq. We use the machinery of infinitary rewriting and formalise the idea of well-behaved programs using the concept of infinitary normalisation. The set of finite and infinite terms is defined as a metric completion. We establish a precise connection between the modal operator (o A ) and the metric at a syntactic level by relating a variable of type (o A ) with the depth of all its occurrences in a term. This syntactic connection between the modal operator and the depth is the key to the proofs of infinitary weak and strong normalisation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Endrullis:2012:CES, author = "J{\"o}rg Endrullis and Dimitri Hendriks and Rena Bakhshi", title = "On the complexity of equivalence of specifications of infinite objects", journal = j-SIGPLAN, volume = "47", number = "9", pages = "153--164", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364551", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study the complexity of deciding the equality of infinite objects specified by systems of equations, and of infinite objects specified by $ \lambda $-terms. For equational specifications there are several natural notions of equality: equality in all models, equality of the sets of solutions, and equality of normal forms for productive specifications. For $ \lambda $-terms we investigate B{\"o}hm-tree equality and various notions of observational equality. We pinpoint the complexity of each of these notions in the arithmetical or analytical hierarchy. We show that the complexity of deciding equality in all models subsumes the entire analytical hierarchy.
This holds already for the most simple infinite objects, viz. streams over $ \{ 0, 1 \} $, and stands in sharp contrast to the low arithmetical {$ \Pi^0_2 $}-completeness of equality of equationally specified streams derived in [17] employing a different notion of equality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Simoes:2012:AAA, author = "Hugo Sim{\~o}es and Pedro Vasconcelos and M{\'a}rio Florido and Steffen Jost and Kevin Hammond", title = "Automatic amortised analysis of dynamic memory allocation for lazy functional programs", journal = j-SIGPLAN, volume = "47", number = "9", pages = "165--176", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364575", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes the first successful attempt, of which we are aware, to define an automatic, type-based static analysis of resource bounds for lazy functional programs. Our analysis uses the automatic amortisation approach developed by Hofmann and Jost, which was previously restricted to eager evaluation. In this paper, we extend this work to a lazy setting by capturing the costs of unevaluated expressions in type annotations and by amortising the payment of these costs using a notion of lazy potential. We present our analysis as a proof system for predicting heap allocations of a minimal functional language (including higher-order functions and recursive data types) and define a formal cost model based on Launchbury's natural semantics for lazy evaluation. We prove the soundness of our analysis with respect to the cost model. Our approach is illustrated by a number of representative and non-trivial examples that have been analysed using a prototype implementation of our analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Earl:2012:IPA, author = "Christopher Earl and Ilya Sergey and Matthew Might and David {Van Horn}", title = "Introspective pushdown analysis of higher-order programs", journal = j-SIGPLAN, volume = "47", number = "9", pages = "177--188", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364576", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the static analysis of functional programs, pushdown flow analysis and abstract garbage collection skirt just inside the boundaries of soundness and decidability. Alone, each method reduces analysis times and boosts precision by orders of magnitude. This work illuminates and conquers the theoretical challenges that stand in the way of combining the power of these techniques. The challenge in marrying these techniques is not subtle: computing the reachable control states of a pushdown system relies on limiting access during transition to the top of the stack; abstract garbage collection, on the other hand, needs full access to the entire stack to compute a root set, just as concrete collection does. 
Introspective pushdown systems resolve this conflict. Introspective pushdown systems provide enough access to the stack to allow abstract garbage collection, but they remain restricted enough to compute control-state reachability, thereby enabling the sound and precise product of pushdown analysis and abstract garbage collection. Experiments reveal synergistic interplay between the techniques, and the fusion demonstrates ``better-than-both-worlds'' precision.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Launchbury:2012:ELT, author = "John Launchbury and Iavor S. Diatchki and Thomas DuBuisson and Andy Adams-Moran", title = "Efficient lookup-table protocol in secure multiparty computation", journal = j-SIGPLAN, volume = "47", number = "9", pages = "189--200", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364556", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Secure multiparty computation (SMC) permits a collection of parties to compute a collaborative result, without any of the parties gaining any knowledge about the inputs provided by other parties. Specifications for SMC are commonly presented as boolean circuits, where optimizations come mostly from reducing the number of multiply-operations (including and-gates) --- these are the operations which incur significant cost, either in computation overhead or in communication between the parties. Instead, we take a language-oriented approach, and consequently are able to explore many other kinds of optimizations. We present an efficient and general purpose SMC table-lookup algorithm that can serve as a direct alternative to circuits. Looking up a private (i.e. shared, or encrypted) n-bit argument in a public table requires log(n) parallel-and operations. We use the advanced encryption standard algorithm (AES) as a driving motivation, and by introducing different kinds of parallelization techniques, produce the fastest current SMC implementation of AES, improving the best previously reported results by well over an order of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Stefan:2012:ACT, author = "Deian Stefan and Alejandro Russo and Pablo Buiras and Amit Levy and John C. Mitchell and David Mazi{\'e}res", title = "Addressing covert termination and timing channels in concurrent information flow systems", journal = j-SIGPLAN, volume = "47", number = "9", pages = "201--214", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364557", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When termination of a program is observable by an adversary, confidential information may be leaked by terminating accordingly. While this termination covert channel has limited bandwidth for sequential programs, it is a more dangerous source of information leakage in concurrent settings.
We address concurrent termination and timing channels by presenting a dynamic information-flow control system that mitigates and eliminates these channels while allowing termination and timing to depend on secret values. Intuitively, we leverage concurrency by placing such potentially sensitive actions in separate threads. While termination and timing of these threads may expose secret values, our system requires any thread observing these properties to raise its information-flow label accordingly, preventing leaks to lower-labeled contexts. We implement this approach in a Haskell library and demonstrate its applicability by building a web server that uses information-flow control to restrict untrusted web applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{zuSiederdissen:2012:SAC, author = "Christian H{\"o}ner zu Siederdissen", title = "Sneaking around {concatMap}: efficient combinators for dynamic programming", journal = j-SIGPLAN, volume = "47", number = "9", pages = "215--226", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364559", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a framework of dynamic programming combinators that provides a high-level environment to describe the recursions typical of dynamic programming over sequence data in a style very similar to algebraic dynamic programming (ADP). Using a combination of type-level programming and stream fusion leads to a substantial increase in performance, without sacrificing much of the convenience and theoretical underpinnings of ADP. We draw examples from the field of computational biology, more specifically RNA secondary structure prediction, to demonstrate how to use these combinators and what differences exist between this library, ADP, and other approaches. The final version of the combinator library allows writing algorithms with performance close to hand-optimized C code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Daniels:2012:ERH, author = "Noah M. Daniels and Andrew Gallant and Norman Ramsey", title = "Experience report: {Haskell} in computational biology", journal = j-SIGPLAN, volume = "47", number = "9", pages = "227--234", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364560", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Haskell gives computational biologists the flexibility and rapid prototyping of a scripting language, plus the performance of native code. In our experience, higher-order functions, lazy evaluation, and monads really worked, but profiling and debugging presented obstacles. Also, Haskell libraries vary greatly: memoization combinators and parallel-evaluation strategies helped us a lot, but other, nameless libraries mostly got in our way. 
Despite the obstacles and the uncertain quality of some libraries, Haskell's ecosystem made it easy for us to develop new algorithms in computational biology.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Foltzer:2012:MSP, author = "Adam Foltzer and Abhishek Kulkarni and Rebecca Swords and Sajith Sasidharan and Eric Jiang and Ryan Newton", title = "A meta-scheduler for the par-monad: composable scheduling for the heterogeneous cloud", journal = j-SIGPLAN, volume = "47", number = "9", pages = "235--246", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364562", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern parallel computing hardware demands increasingly specialized attention to the details of scheduling and load balancing across heterogeneous execution resources that may include GPU and cloud environments, in addition to traditional CPUs. Many existing solutions address the challenges of particular resources, but do so in isolation, and in general do not compose within larger systems. We propose a general, composable abstraction for execution resources, along with a continuation-based meta-scheduler that harnesses those resources in the context of a deterministic parallel programming library for Haskell. We demonstrate performance benefits of combined CPU/GPU scheduling over either alone, and of combined multithreaded/distributed scheduling over existing distributed programming approaches for Haskell.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Bergstrom:2012:NDP, author = "Lars Bergstrom and John Reppy", title = "Nested data-parallelism on the {GPU}", journal = j-SIGPLAN, volume = "47", number = "9", pages = "247--258", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364563", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graphics processing units (GPUs) provide both memory bandwidth and arithmetic performance far greater than that available on CPUs but, because of their Single-Instruction-Multiple-Data (SIMD) architecture, they are hard to program. Most of the programs ported to GPUs thus far use traditional data-level parallelism, performing only operations that operate uniformly over vectors. NESL is a first-order functional language that was designed to allow programmers to write irregular-parallel programs --- such as parallel divide-and-conquer algorithms --- for wide-vector parallel computers. This paper presents our port of the NESL implementation to work on GPUs and provides empirical evidence that nested data-parallelism (NDP) on GPUs significantly outperforms CPU-based implementations and matches or beats newer GPU languages that support only flat parallelism. While our performance does not match that of hand-tuned CUDA programs, we argue that the notational conciseness of NESL is worth the loss in performance. 
This work provides the first language implementation that directly supports NDP on a GPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Lippmeier:2012:WEH, author = "Ben Lippmeier and Manuel M. T. Chakravarty and Gabriele Keller and Roman Leshchinskiy and Simon Peyton Jones", title = "Work efficient higher-order vectorisation", journal = j-SIGPLAN, volume = "47", number = "9", pages = "259--270", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364564", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Existing approaches to higher-order vectorisation, also known as flattening nested data parallelism, do not preserve the asymptotic work complexity of the source program. Straightforward examples, such as sparse matrix-vector multiplication, can suffer a severe blow-up in both time and space, which limits the practicality of this method. We discuss why this problem arises, identify the mis-handling of index space transforms as the root cause, and present a solution using a refined representation of nested arrays. We have implemented this solution in Data Parallel Haskell (DPH) and present benchmarks showing that realistic programs, which used to suffer the blow-up, now have the correct asymptotic work complexity. In some cases, the asymptotic complexity of the vectorised program is even better than the original.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Sewell:2012:TJ, author = "Peter Sewell", title = "Tales from the jungle", journal = j-SIGPLAN, volume = "47", number = "9", pages = "271--272", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364566", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We rely on a computational infrastructure that is a densely intertwined mass of software and hardware: programming languages, network protocols, operating systems, and processors. It has accumulated great complexity, from a combination of engineering design decisions, contingent historical choices, and sheer scale, yet it is defined at best by prose specifications, or, all too often, just by the common implementations. Can we do better? More specifically, can we apply rigorous methods to this mainstream infrastructure, taking the accumulated complexity seriously, and if we do, does it help? My colleagues and I have looked at these questions in several contexts: the TCP/IP network protocols with their Sockets API; programming language design, including the Java module system and the C11/C++11 concurrency model; the hardware concurrency behaviour of x86, IBM POWER, and ARM multiprocessors; and compilation of concurrent code. In this talk I will draw some lessons from what did and did not succeed, looking especially at the empirical nature of some of the work, at the social process of engagement with the various different communities, and at the mathematical and software tools we used.
Domain-specific modelling languages (based on functional programming ideas) and proof assistants were invaluable for working with the large and loose specifications involved: idioms within HOL4 for TCP, our Ott tool for programming language specification, and Owens's Lem tool for portable semantic definitions, with HOL4, Isabelle, and Coq, for the relaxed-memory concurrency semantics work. Our experience with these suggests something of what is needed to make full-scale rigorous semantics a commonplace reality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Wadler:2012:PS, author = "Philip Wadler", title = "Propositions as sessions", journal = j-SIGPLAN, volume = "47", number = "9", pages = "273--286", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364568", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Continuing a line of work by Abramsky (1994), by Bellin and Scott (1994), and by Caires and Pfenning (2010), among others, this paper presents CP, a calculus in which propositions of classical linear logic correspond to session types. Continuing a line of work by Honda (1993), by Honda, Kubo, and Vasconcelos (1998), and by Gay and Vasconcelos (2010), among others, this paper presents GV, a linear functional language with session types, and presents a translation from GV into CP. The translation formalises for the first time a connection between a standard presentation of session types and linear logic, and shows how a modification to the standard presentation yields a language free from deadlock, where deadlock freedom follows from the correspondence to linear logic.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Henry:2012:TUM, author = "Gr{\'e}goire Henry and Michel Mauny and Emmanuel Chailloux and Pascal Manoury", title = "Typing unmarshalling without marshalling types", journal = j-SIGPLAN, volume = "47", number = "9", pages = "287--298", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364569", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Unmarshalling primitives in statically typed languages require, in order to preserve type safety, to dynamically verify the compatibility between the incoming values and the statically expected type. In the context of programming languages based on parametric polymorphism and uniform data representation, we propose a relation of compatibility between (unmarshalled) memory graphs and types. It is defined as constraints over nodes of the memory graph. Then, we propose an algorithm to check the compatibility between a memory graph and a type. It is described as a constraint solver based on a rewriting system. We have shown that the proposed algorithm is sound and semi-complete in the presence of algebraic data types, mutable data, polymorphic sharing, cycles, and functional values; however, in its general form, it may not terminate.
We have implemented a prototype tailored for the OCaml compiler [17] that always terminates and still seems sufficiently complete in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Jones:2012:DD, author = "Will Jones and Tony Field and Tristan Allwood", title = "Deconstraining {DSLs}", journal = j-SIGPLAN, volume = "47", number = "9", pages = "299--310", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364571", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Strongly-typed functional languages provide a powerful framework for embedding Domain-Specific Languages (DSLs). However, building type-safe functions defined over an embedded DSL can introduce application-specific type constraints that end up being imposed on the DSL data types themselves. At best, these constraints are unwieldy and at worst they can limit the range of DSL expressions that can be built. We present a simple solution to this problem that allows application-specific constraints to be specified at the point of use of a DSL expression rather than when the DSL's embedding types are defined. Our solution applies equally to both tagged and tagless representations and, importantly, also works in the presence of higher-rank types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Mainland:2012:EHM, author = "Geoffrey Mainland", title = "Explicitly heterogeneous metaprogramming with {MetaHaskell}", journal = j-SIGPLAN, volume = "47", number = "9", pages = "311--322", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364572", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Languages with support for metaprogramming, like MetaOCaml, offer a principled approach to code generation by guaranteeing that well-typed metaprograms produce well-typed programs. However, many problem domains where metaprogramming can fruitfully be applied require generating code in languages like C, CUDA, or assembly. Rather than resorting to add-hoc code generation techniques, these applications should be directly supported by explicitly heterogeneous metaprogramming languages. We present MetaHaskell, an extension of Haskell 98 that provides modular syntactic and type system support for type safe metaprogramming with multiple object languages. Adding a new object language to MetaHaskell requires only minor modifications to the host language to support type-level quantification over object language types and propagation of type equality constraints. We demonstrate the flexibility of our approach through three object languages: a core ML language, a linear variant of the core ML language, and a subset of C. All three languages support metaprogramming with open terms and guarantee that well-typed MetaHaskell programs will only produce closed object terms that are well-typed. The essence of MetaHaskell is captured in a type system for a simplified metalanguage. 
MetaHaskell, as well as all three object languages, are fully implemented in the mhc bytecode compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Axelsson:2012:GAS, author = "Emil Axelsson", title = "A generic abstract syntax model for embedded languages", journal = j-SIGPLAN, volume = "47", number = "9", pages = "323--334", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364573", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Representing a syntax tree using a data type often involves having many similar-looking constructors. Functions operating on such types often end up having many similar-looking cases. Different languages often make use of similar-looking constructions. We propose a generic model of abstract syntax trees capable of representing a wide range of typed languages. Syntactic constructs can be composed in a modular fashion enabling reuse of abstract syntax and syntactic processing within and across languages. Building on previous methods of encoding extensible data types in Haskell, our model is a pragmatic solution to Wadler's ``expression problem''. Its practicality has been confirmed by its use in the implementation of the embedded language Feldspar.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Pike:2012:ERD, author = "Lee Pike and Nis Wegmann and Sebastian Niller and Alwyn Goodloe", title = "Experience report: a do-it-yourself high-assurance compiler", journal = j-SIGPLAN, volume = "47", number = "9", pages = "335--340", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364553", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Embedded domain-specific languages (EDSLs) are an approach for quickly building new languages while maintaining the advantages of a rich metalanguage. We argue in this experience report that the ``EDSL approach'' can surprisingly ease the task of building a high-assurance compiler. We do not strive to build a fully formally-verified tool-chain, but take a ``do-it-yourself'' approach to increase our confidence in compiler-correctness without too much effort. Copilot is an EDSL developed by Galois, Inc. and the National Institute of Aerospace under contract to NASA for the purpose of runtime monitoring of flight-critical avionics. 
We report our experience in using type-checking, QuickCheck, and model-checking ``off-the-shelf'' to quickly increase confidence in our EDSL tool-chain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Vytiniotis:2012:EPD, author = "Dimitrios Vytiniotis and Simon Peyton Jones and Jos{\'e} Pedro Magalh{\~a}es", title = "Equality proofs and deferred type errors: a compiler pearl", journal = j-SIGPLAN, volume = "47", number = "9", pages = "341--352", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364554", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Glasgow Haskell Compiler is an optimizing compiler that expresses and manipulates first-class equality proofs in its intermediate language. We describe a simple, elegant technique that exploits these equality proofs to support deferred type errors. The technique requires us to treat equality proofs as possibly-divergent terms; we show how to do so without losing either soundness or the zero-overhead cost model that the programmer expects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Neatherway:2012:TBA, author = "Robin P. Neatherway and Steven J. Ramsay and Chih-Hao Luke Ong", title = "A traversal-based algorithm for higher-order model checking", journal = j-SIGPLAN, volume = "47", number = "9", pages = "353--364", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364578", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Higher-order model checking --- the model checking of trees generated by higher-order recursion schemes (HORS) --- is a natural generalisation of finite-state and pushdown model checking. Recent work has shown that it can serve as a basis for software model checking for functional languages such as ML and Haskell. In this paper, we introduce higher-order recursion schemes with cases (HORSC), which extend HORS with a definition-by-cases construct (to express program branching based on data) and non-determinism (to express abstractions of behaviours). This paper is a study of the universal HORSC model checking problem for deterministic trivial automata: does the automaton accept every tree in the tree language generated by the given HORSC? We first characterise the model checking problem by an intersection type system extended with a carefully restricted form of union types. We then present an algorithm for deciding the model checking problem, which is based on the notion of traversals induced by the fully abstract game semantics of these schemes, but presented as a goal-directed construction of derivations in the intersection and union type system. We view HORSC model checking as a suitable backend engine for an approach to verifying functional programs. 
We have implemented the algorithm in a tool called TravMC, and demonstrated its effectiveness on a test suite of programs, including abstract models of functional programs obtained via an abstraction-refinement procedure from pattern-matching recursion schemes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Perera:2012:FPE, author = "Roly Perera and Umut A. Acar and James Cheney and Paul Blain Levy", title = "Functional programs that explain their work", journal = j-SIGPLAN, volume = "47", number = "9", pages = "365--376", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364579", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present techniques that enable higher-order functional computations to ``explain'' their work by answering questions about how parts of their output were calculated. As explanations, we consider the traditional notion of program slices, which we show can be inadequate, and propose a new notion: trace slices. We present techniques for specifying flexible and rich slicing criteria based on partial expressions, parts of which have been replaced by holes. We characterise program slices in an algorithm-independent fashion and show that a least slice for a given criterion exists. We then present an algorithm, called unevaluation, for computing least program slices from computations reified as traces. Observing a limitation of program slices, we develop a notion of trace slice as another form of explanation and present an algorithm for computing them. The unevaluation algorithm can be applied to any subtrace of a trace slice to compute a program slice whose evaluation generates that subtrace. This close correspondence between programs, traces, and their slices can enable the programmer to understand a computation interactively, in terms of the programming language in which the computation is expressed. We present an implementation in the form of a tool, discuss some important practical implementation concerns and present some techniques for addressing them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '12 conference proceedings.", } @Article{Suenaga:2012:TBS, author = "Kohei Suenaga and Ryota Fukuda and Atsushi Igarashi", title = "Type-based safe resource deallocation for shared-memory concurrency", journal = j-SIGPLAN, volume = "47", number = "10", pages = "1--20", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384618", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a type system to guarantee safe resource deallocation for shared-memory concurrent programs by extending the previous type system based on fractional ownerships. Here, safe resource deallocation means that memory cells, locks, or threads are not left allocated when a program terminates. Our framework supports (1) fork/join parallelism, (2) synchronization with locks, and (3) dynamically allocated memory cells and locks. 
The type system is proved to be sound. We also provide a type inference algorithm for the type system and a prototype implementation of the algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Gordon:2012:URI, author = "Colin S. Gordon and Matthew J. Parkinson and Jared Parsons and Aleks Bromfield and Joe Duffy", title = "Uniqueness and reference immutability for safe parallelism", journal = j-SIGPLAN, volume = "47", number = "10", pages = "21--40", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384619", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A key challenge for concurrent programming is that side-effects (memory operations) in one thread can affect the behavior of another thread. In this paper, we present a type system to restrict the updates to memory to prevent these unintended side-effects. We provide a novel combination of immutable and unique (isolated) types that ensures safe parallelism (race freedom and deterministic execution). The type system includes support for polymorphism over type qualifiers, and can easily create cycles of immutable objects. Key to the system's flexibility is the ability to recover immutable or externally unique references after violating uniqueness without any explicit alias tracking. Our type system models a prototype extension to C\# that is in active use by a Microsoft team. We describe their experiences building large systems with this extension. We prove the soundness of the type system by an embedding into a program logic.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Sreeram:2012:SCD, author = "Jaswanth Sreeram and Santosh Pande", title = "Safe compiler-driven transaction checkpointing and recovery", journal = j-SIGPLAN, volume = "47", number = "10", pages = "41--56", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384620", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Several studies have shown that a large fraction of the work performed inside memory transactions in representative programs is wasted due to the transaction experiencing a conflict and aborting. Aborts inside long running transactions are especially influential to performance and the simplicity of the TM programming model (relative to using finegrained locking) in synchronizing large critical sections means that large transactions are common and this exacerbates the problem of wasted work. In this paper we present a practical transaction checkpoint and recovery scheme in which transactions that experience a conflict can restore their state (including the local context in which they were executing) to some dynamic program point before this access and begin execution from that point. 
This state saving and restoration is implemented by checkpoint operations that are generated by a compiler into the transaction's body and are also optimized to reduce the amount of state that is saved and restored. We also describe a runtime system that manages these checkpointed states and orchestrates the restoration of the right checkpointed state for a conflict on a particular transactional access. Moreover the synthesis of these save and restore operations, their optimization and invocation at runtime are completely transparent to the programmer. We have implemented the checkpoint generation and optimization scheme in the LLVM compiler and runtime support for the TL2 STM system. Our experiments indicate that for many parallel programs using such checkpoint recovery schemes can result in upto several orders of magnitude reduction in number of aborts and significant execution time speedups relative to plain transactional programs for the same number of threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Muller:2012:TPS, author = "Stefan Muller and Stephen Chong", title = "Towards a practical secure concurrent language", journal = j-SIGPLAN, volume = "47", number = "10", pages = "57--74", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384621", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We demonstrate that a practical concurrent language can be extended in a natural way with information security mechanisms that provably enforce strong information security guarantees. We extend the X10 concurrent programming language with coarse-grained information-flow control. Central to X10 concurrency abstractions is the notion of a place: a container for data and computation. We associate a security level with each place, and restrict each place to store only data appropriate for that security level. When places interact only with other places at the same security level, then our security mechanisms impose no restrictions. When places of differing security levels interact, our information security analysis prevents potentially dangerous information flows, including information flow through covert scheduling channels. The X10 concurrency mechanisms simplify reasoning about information flow in concurrent programs. We present a static analysis that enforces a noninterference-based extensional information security condition in a calculus that captures the key aspects of X10's place abstraction and async-finish parallelism. 
We extend this security analysis to support many of X10's language features, and have implemented a prototype compiler for the resulting language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Parizek:2012:PAJ, author = "Pavel Par{\'\i}zek and Ond{\v{r}}ej Lhot{\'a}k", title = "Predicate abstraction of {Java} programs with collections", journal = j-SIGPLAN, volume = "47", number = "10", pages = "75--94", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384623", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Our goal is to develop precise and scalable verification techniques for Java programs that use collections and properties that depend on their content. We apply the popular approach of predicate abstraction to Java programs and collections. The main challenge in this context is precise and compact modeling of collections that enables practical verification. We define a predicate language for modeling the observable state of Java collections at the interface level. Changes of the state by API methods are captured by weakest preconditions. We adapt existing techniques for construction of abstract programs. Most notably, we designed optimizations based on specific features of the predicate language. We evaluated our approach on Java programs that use collections in advanced ways. Our results show that interesting properties, such as consistency between multiple collections, can be verified using our approach. The properties are specified using logic formulas that involve predicates introduced by our language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Schiller:2012:RBW, author = "Todd W. Schiller and Michael D. Ernst", title = "Reducing the barriers to writing verified specifications", journal = j-SIGPLAN, volume = "47", number = "10", pages = "95--112", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384624", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Formally verifying a program requires significant skill not only because of complex interactions between program subcomponents, but also because of deficiencies in current verification interfaces. These skill barriers make verification economically unattractive by preventing the use of less-skilled (less-expensive) workers and distributed workflows (i.e., crowdsourcing). This paper presents VeriWeb, a web-based IDE for verification that decomposes the task of writing verifiable specifications into manageable subproblems. To overcome the information loss caused by task decomposition, and to reduce the skill required to verify a program, VeriWeb incorporates several innovative user interface features: drag and drop condition construction, concrete counterexamples, and specification inlining. To evaluate VeriWeb, we performed three experiments.
First, we show that VeriWeb lowers the time and monetary cost of verification by performing a comparative study of VeriWeb and a traditional tool using 14 paid subjects contracted hourly from Exhedra Solution's vWorker online marketplace. Second, we demonstrate the dearth and insufficiency of current ad-hoc labor marketplaces for verification by recruiting workers from Amazon's Mechanical Turk to perform verification with VeriWeb. Finally, we characterize the minimal communication overhead incurred when VeriWeb is used collaboratively by observing two pairs of developers each use the tool simultaneously to verify a single program.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Betts:2012:GVG, author = "Adam Betts and Nathan Chong and Alastair Donaldson and Shaz Qadeer and Paul Thomson", title = "{GPUVerify}: a verifier for {GPU} kernels", journal = j-SIGPLAN, volume = "47", number = "10", pages = "113--132", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384625", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a technique for verifying race- and divergence-freedom of GPU kernels that are written in mainstream kernel programming languages such as OpenCL and CUDA. Our approach is founded on a novel formal operational semantics for GPU programming termed synchronous, delayed visibility (SDV) semantics. The SDV semantics provides a precise definition of barrier divergence in GPU kernels and allows kernel verification to be reduced to analysis of a sequential program, thereby completely avoiding the need to reason about thread interleavings, and allowing existing modular techniques for program verification to be leveraged. We describe an efficient encoding for data race detection and propose a method for automatically inferring loop invariants required for verification. We have implemented these techniques as a practical verification tool, GPUVerify, which can be applied directly to OpenCL and CUDA source code. We evaluate GPUVerify with respect to a set of 163 kernels drawn from public and commercial sources. Our evaluation demonstrates that GPUVerify is capable of efficient, automatic verification of a large number of real-world kernels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Logozzo:2012:MVA, author = "Francesco Logozzo and Thomas Ball", title = "Modular and verified automatic program repair", journal = j-SIGPLAN, volume = "47", number = "10", pages = "133--146", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384626", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study the problem of suggesting code repairs at design time, based on the warnings issued by modular program verifiers. 
We introduce the concept of a verified repair, a change to a program's source that removes bad execution traces while increasing the number of good traces, where the bad/good traces form a partition of all the traces of a program. Repairs are property-specific. We demonstrate our framework in the context of warnings produced by the modular cccheck (a.k.a. Clousot) abstract interpreter, and generate repairs for missing contracts, incorrect locals and objects initialization, wrong conditionals, buffer overruns, arithmetic overflow and incorrect floating point comparisons. We report our experience with automatically generating repairs for the {.NET} framework libraries, generating verified repairs for over 80\% of the warnings generated by cccheck.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Kulkarni:2012:MCO, author = "Sameer Kulkarni and John Cavazos", title = "Mitigating the compiler optimization phase-ordering problem using machine learning", journal = j-SIGPLAN, volume = "47", number = "10", pages = "147--162", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384628", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today's compilers have a plethora of optimizations to choose from, and the correct choice of optimizations can have a significant impact on the performance of the code being optimized. Furthermore, choosing the correct order in which to apply those optimizations has been a long standing problem in compilation research. Each of these optimizations interacts with the code and in turn with all other optimizations in complicated ways. Traditional compilers typically apply the same set of optimizations in a fixed order to all functions in a program, without regard to the code being optimized. Understanding the interactions of optimizations is very important in determining a good solution to the phase-ordering problem. This paper develops a new approach that automatically selects good optimization orderings on a per method basis within a dynamic compiler. Our approach formulates the phase-ordering problem as a Markov process and uses a characterization of the current state of the code being optimized to create a better solution to the phase ordering problem. Our technique uses neuro-evolution to construct an artificial neural network that is capable of predicting beneficial optimization ordering for a piece of code that is being optimized.
We implemented our technique in Jikes RVM and achieved significant improvements on a set of standard Java benchmarks over a well-engineered fixed order.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{St-Amour:2012:OCO, author = "Vincent St-Amour and Sam Tobin-Hochstadt and Matthias Felleisen", title = "Optimization coaching: optimizers learn to communicate with programmers", journal = j-SIGPLAN, volume = "47", number = "10", pages = "163--178", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384629", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Optimizing compilers map programs in high-level languages to high-performance target language code. To most programmers, such a compiler constitutes an impenetrable black box whose inner workings are beyond their understanding. Since programmers often must understand the workings of their compilers to achieve their desired performance goals, they typically resort to various forms of reverse engineering, such as examining compiled code or intermediate forms. Instead, optimizing compilers should engage programmers in a dialog. This paper introduces one such possible form of dialog: optimization coaching. An optimization coach watches while a program is compiled, analyzes the results, generates suggestions for enabling further compiler optimization in the source program, and presents a suitable synthesis of its results to the programmer. We present an evaluation based on case studies, which illustrate how an optimization coach can help programmers achieve optimizations resulting in substantial performance improvements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Inoue:2012:AML, author = "Hiroshi Inoue and Hiroshige Hayashizaki and Peng Wu and Toshio Nakatani", title = "Adaptive multi-level compilation in a trace-based {Java JIT} compiler", journal = j-SIGPLAN, volume = "47", number = "10", pages = "179--194", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384630", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes our multi-level compilation techniques implemented in a trace-based Java JIT compiler (trace-JIT). Like existing multi-level compilation for method-based compilers, we start JIT compilation with a small compilation scope and a low optimization level so the program can start running quickly. Then we identify hot paths with a timer-based sampling profiler, generate long traces that capture the hot paths, and recompile them with a high optimization level to improve the peak performance. A key to high performance is selecting long traces that effectively capture the entire hot paths for upgrade recompilations. To do this, we introduce a new technique to generate a directed graph representing the control flow, a TTgraph, and use the TTgraph in the trace selection engine to efficiently select long traces. 
We show that our multi-level compilation improves the peak performance of programs by up to 58.5\% and 22.2\% on average compared to compiling all of the traces only at a low optimization level. Comparing the performance with our multi-level compilation to the performance when compiling all of the traces at a high optimization level, our technique can reduce the startup times of programs by up to 61.1\% and 31.3\% on average without significant reduction in the peak performance. Our results show that our adaptive multi-level compilation can balance the peak performance and startup time by taking advantage of different optimization levels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Castanos:2012:BPE, author = "Jose Castanos and David Edelsohn and Kazuaki Ishizaki and Priya Nagpurkar and Toshio Nakatani and Takeshi Ogasawara and Peng Wu", title = "On the benefits and pitfalls of extending a statically typed language {JIT} compiler for dynamic scripting languages", journal = j-SIGPLAN, volume = "47", number = "10", pages = "195--212", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384631", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Whenever the need to compile a new dynamically typed language arises, an appealing option is to repurpose an existing statically typed language Just-In-Time (JIT) compiler (repurposed JIT compiler). Existing repurposed JIT compilers (RJIT compilers), however, have not yet delivered the hoped-for performance boosts. The performance of JVM languages, for instance, often lags behind standard interpreter implementations. Even more customized solutions that extend the internals of a JIT compiler for the target language compete poorly with those designed specifically for dynamically typed languages. Our own Fiorano JIT compiler is an example of this problem. As a state-of-the-art, RJIT compiler for Python, the Fiorano JIT compiler outperforms two other RJIT compilers (Unladen Swallow and Jython), but still shows a noticeable performance gap compared to PyPy, today's best performing Python JIT compiler. In this paper, we discuss techniques that have proved effective in the Fiorano JIT compiler as well as limitations of our current implementation. More importantly, this work offers the first in-depth look at benefits and limitations of the repurposed JIT compiler approach. We believe the most common pitfall of existing RJIT compilers is not focusing sufficiently on specialization, an abundant optimization opportunity unique to dynamically typed languages. Unfortunately, the lack of specialization cannot be overcome by applying traditional optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Cousot:2012:AIFb, author = "Patrick M. 
Cousot and Radhia Cousot and Francesco Logozzo and Michael Barnett", title = "An abstract interpretation framework for refactoring with application to extract methods with contracts", journal = j-SIGPLAN, volume = "47", number = "10", pages = "213--232", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384633", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Method extraction is a common refactoring feature provided by most modern IDEs. It replaces a user-selected piece of code with a call to an automatically generated method. We address the problem of automatically inferring contracts (precondition, postcondition) for the extracted method. We require the inferred contract: (a) to be valid for the extracted method (validity); (b) to guard the language and programmer assertions in the body of the extracted method by an opportune precondition (safety); (c) to preserve the proof of correctness of the original code when analyzing the new method separately (completeness); and (d) to be the most general possible (generality). These requirements rule out trivial solutions (e.g., inlining, projection, etc). We propose two theoretical solutions to the problem. The first one is simple and optimal. It is valid, safe, complete and general but unfortunately not effectively computable (except for unrealistic finiteness/decidability hypotheses). The second one is based on an iterative forward/backward method. We show it to be valid, safe, and, under reasonable assumptions, complete and general. We prove that the second solution subsumes the first. All justifications are provided with respect to a new, set-theoretic version of Hoare logic (hence without logic), and abstractions of Hoare logic, revisited to avoid surprisingly unsound inference rules. We have implemented the new algorithms on the top of two industrial-strength tools (CCCheck and the Microsoft Roslyn CTP). Our experience shows that the analysis is both fast enough to be used in an interactive environment and precise enough to generate good annotations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Zhang:2012:RAJ, author = "Ying Zhang and Gang Huang and Xuanzhe Liu and Wei Zhang and Hong Mei and Shunxiang Yang", title = "Refactoring {Android Java} code for on-demand computation offloading", journal = j-SIGPLAN, volume = "47", number = "10", pages = "233--248", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384634", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computation offloading is a promising way to improve the performance as well as reducing the battery power consumption of a smartphone application by executing some parts of the application on a remote server. 
Supporting such capability is not easy for smartphone application developers due to (1) correctness: some code, e.g., that for GPS, gravity, and other sensors, can run only on the smartphone so that developers have to identify which parts of the application cannot be offloaded; (2) effectiveness: the reduced execution time must be greater than the network delay caused by computation offloading so that developers need to calculate which parts are worth offloading; (3) adaptability: smartphone applications often face changes of user requirements and runtime environments so that developers need to implement the adaptation on offloading. More importantly, considering the large number of today's smartphone applications, solutions applicable for legacy applications will be much more valuable. In this paper, we present a tool, named DPartner, that automatically refactors Android applications to be the ones with computation offloading capability. For a given Android application, DPartner first analyzes its bytecode for discovering the parts worth offloading, then rewrites the bytecode to implement a special program structure supporting on-demand offloading, and finally generates two artifacts to be deployed onto an Android phone and the server, respectively. We evaluated DPartner on three real-world Android applications, demonstrating the reduction of execution time by 46\%-97\% and battery power consumption by 27\%-83\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Hayden:2012:KEG, author = "Christopher M. Hayden and Edward K. Smith and Michail Denchev and Michael Hicks and Jeffrey S. Foster", title = "{Kitsune}: efficient, general-purpose dynamic software updating for {C}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "249--264", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384635", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic software updating (DSU) systems allow programs to be updated while running, thereby permitting developers to add features and fix bugs without downtime. This paper introduces Kitsune, a new DSU system for C whose design has three notable features. First, Kitsune's updating mechanism updates the whole program, not individual functions. This mechanism is more flexible than most prior approaches and places no restrictions on data representations or allowed compiler optimizations. Second, Kitsune makes the important aspects of updating explicit in the program text, making the program's semantics easy to understand while minimizing programmer effort. Finally, the programmer can write simple specifications to direct Kitsune to generate code that traverses and transforms old-version state for use by new code; such state transformation is often necessary, and is significantly more difficult in prior DSU systems. 
We have used Kitsune to update five popular, open-source, single- and multi-threaded programs, and find that few program changes are required to use Kitsune, and that it incurs essentially no performance overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Magill:2012:AOT, author = "Stephen Magill and Michael Hicks and Suriya Subramanian and Kathryn S. McKinley", title = "Automating object transformations for dynamic software updating", journal = j-SIGPLAN, volume = "47", number = "10", pages = "265--280", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384636", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic software updating (DSU) systems eliminate costly downtime by dynamically fixing bugs and adding features to executing programs. Given a static code patch, most DSU systems construct runtime code changes automatically. However, a dynamic update must also specify how to change the running program's execution state, e.g., the stack and heap, to make it compatible with the new code. Constructing such state transformations correctly and automatically remains an open problem. This paper presents a solution called Targeted Object Synthesis (TOS). TOS first executes the same tests on the old and new program versions separately, observing the program heap state at a few corresponding points. Given two corresponding heap states, TOS matches objects in the two versions using key fields that uniquely identify objects and correlate old and new-version objects. Given example object pairs, TOS then synthesizes the simplest-possible function that transforms an old-version object to its new-version counterpart. We show that TOS is effective on updates to four open-source server programs for which it generates non-trivial transformation functions that use conditionals, operate on collections, and fix memory leaks. These transformations help programmers understand their changes and apply dynamic software updates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Sartor:2012:EMT, author = "Jennifer B. Sartor and Lieven Eeckhout", title = "Exploring multi-threaded {Java} application performance on multicore hardware", journal = j-SIGPLAN, volume = "47", number = "10", pages = "281--296", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384638", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "While there have been many studies of how to schedule applications to take advantage of increasing numbers of cores in modern-day multicore processors, few have focused on multi-threaded managed language applications which are prevalent from the embedded to the server domain. Managed languages complicate performance studies because they have additional virtual machine threads that collect garbage and dynamically compile, closely interacting with application threads.
Further complexity is introduced as modern multicore machines have multiple sockets and dynamic frequency scaling options, broadening opportunities to reduce both power and running time. In this paper, we explore the performance of Java applications, studying how best to map application and virtual machine (JVM) threads to a multicore, multi-socket environment. We explore both the cost of separating JVM threads from application threads, and the opportunity to speed up or slow down the clock frequency of isolated threads. We perform experiments with the multi-threaded DaCapo benchmarks and pseudojbb2005 running on the Jikes Research Virtual Machine, on a dual-socket, 8-core Intel Nehalem machine to reveal several novel, and sometimes counter-intuitive, findings. We believe these insights are a first but important step towards understanding and optimizing managed language performance on modern hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Kumar:2012:WSB, author = "Vivek Kumar and Daniel Frampton and Stephen M. Blackburn and David Grove and Olivier Tardieu", title = "Work-stealing without the baggage", journal = j-SIGPLAN, volume = "47", number = "10", pages = "297--314", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384639", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Work-stealing is a promising approach for effectively exploiting software parallelism on parallel hardware. A programmer who uses work-stealing explicitly identifies potential parallelism and the runtime then schedules work, keeping otherwise idle hardware busy while relieving overloaded hardware of its burden. Prior work has demonstrated that work-stealing is very effective in practice. However, work-stealing comes with a substantial overhead: as much as 2x to 12x slowdown over orthodox sequential code. In this paper we identify the key sources of overhead in work-stealing schedulers and present two significant refinements to their implementation. We evaluate our work-stealing designs using a range of benchmarks, four different work-stealing implementations, including the popular fork-join framework, and a range of architectures. On these benchmarks, compared to orthodox sequential Java, our fastest design has an overhead of just 15\%. By contrast, fork-join has a 2.3x overhead and the previous implementation of the system we use has an overhead of 4.1x. 
These results and our insight into the sources of overhead for work-stealing implementations give further hope to an already promising technique for exploiting increasingly available hardware parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Bocq:2012:MUM, author = "S{\'e}bastien Bocq and Koen Daenen", title = "{Molecule}: using monadic and streaming {I/O} to compose process networks on the {JVM}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "315--334", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384640", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Molecule is a domain specific language library embedded in Scala for easing the creation of scalable and modular concurrent applications on the JVM. Concurrent applications are modeled as parallel process networks that exchange information over mobile and type-safe messaging interfaces. In this paper, we present a concurrent programming environment that combines functional and imperative programming. Using a monad, we structure the sequential or parallel coordination of user-level threads, without JVM modifications or compiler support. Our mobile channel interfaces expose reusable and parallelizable higher-order functions, as if they were streams in a lazily evaluated functional programming language. The support for graceful termination of entire process networks is simplified by integrating channel poisoning with monadic exceptions and resource control. Our runtime and system-level interfaces leverage message batching and a novel flow parallel scheduler to limit expensive context switches in multicore environments. We illustrate the expressiveness and performance benefits on a 24-core AMD Opteron machine with three classical examples: a thread ring, a genuine prime sieve and a chameneos-redux.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Kalibera:2012:BBA, author = "Tomas Kalibera and Matthew Mole and Richard Jones and Jan Vitek", title = "A black-box approach to understanding concurrency in {DaCapo}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "335--354", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384641", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Increasing levels of hardware parallelism are one of the main challenges for programmers and implementers of managed runtimes. Any concurrency or scalability improvements must be evaluated experimentally. However, application benchmarks available today may not reflect the highly concurrent applications we anticipate in the future. They may also behave in ways that VM developers do not expect. 
We provide a set of platform independent concurrency related metrics and an in-depth observational study of current state of the art benchmarks, discovering how concurrent they really are, how they scale the work and how they synchronise and communicate via shared memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Jo:2012:AEL, author = "Youngjoon Jo and Milind Kulkarni", title = "Automatically enhancing locality for tree traversals with traversal splicing", journal = j-SIGPLAN, volume = "47", number = "10", pages = "355--374", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384643", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Generally applicable techniques for improving temporal locality in irregular programs, which operate over pointer-based data structures such as trees and graphs, are scarce. Focusing on a subset of irregular programs, namely, tree traversal algorithms like Barnes--Hut and nearest neighbor, previous work has proposed point blocking, a technique analogous to loop tiling in regular programs, to improve locality. However point blocking is highly dependent on point sorting, a technique to reorder points so that consecutive points will have similar traversals. Performing this a priori sort requires an understanding of the semantics of the algorithm and hence highly application specific techniques. In this work, we propose traversal splicing, a new, general, automatic locality optimization for irregular tree traversal codes, that is less sensitive to point order, and hence can deliver substantially better performance, even in the absence of semantic information. For six benchmark algorithms, we show that traversal splicing can deliver single-thread speedups of up to 9.147 (geometric mean: 3.095) over baseline implementations, and up to 4.752 (geometric mean: 2.079) over point-blocked implementations. Further, we show that in many cases, automatically applying traversal splicing to a baseline implementation yields performance that is better than carefully hand-optimized implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Prountzos:2012:ESS, author = "Dimitrios Prountzos and Roman Manevich and Keshav Pingali", title = "{Elixir}: a system for synthesizing concurrent graph programs", journal = j-SIGPLAN, volume = "47", number = "10", pages = "375--394", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384644", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Algorithms in new application areas like machine learning and network analysis use ``irregular'' data structures such as graphs, trees and sets. 
Writing efficient parallel code in these problem domains is very challenging because it requires the programmer to make many choices: a given problem can usually be solved by several algorithms, each algorithm may have many implementations, and the best choice of algorithm and implementation can depend not only on the characteristics of the parallel platform but also on properties of the input data such as the structure of the graph. One solution is to permit the application programmer to experiment with different algorithms and implementations without writing every variant from scratch. Auto-tuning to find the best variant is a more ambitious solution. These solutions require a system for automatically producing efficient parallel implementations from high-level specifications. Elixir, the system described in this paper, is the first step towards this ambitious goal. Application programmers write specifications that consist of an operator, which describes the computations to be performed, and a schedule for performing these computations. Elixir uses sophisticated inference techniques to produce efficient parallel code from such specifications. We used Elixir to automatically generate many parallel implementations for three irregular problems: breadth-first search, single source shortest path, and betweenness-centrality computation. Our experiments show that the best generated variants can be competitive with handwritten code for these problems from other research groups; for some inputs, they even outperform the handwritten versions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Liu:2012:CED, author = "Yanhong A. Liu and Scott D. Stoller and Bo Lin and Michael Gorbovitski", title = "From clarity to efficiency for distributed algorithms", journal = j-SIGPLAN, volume = "47", number = "10", pages = "395--410", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384645", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes a very high-level language for clear description of distributed algorithms and optimizations necessary for generating efficient implementations. The language supports high-level control flows where complex synchronization conditions can be expressed using high-level queries, especially logic quantifications, over message history sequences. Unfortunately, the programs would be extremely inefficient, including consuming unbounded memory, if executed straightforwardly. We present new optimizations that automatically transform complex synchronization conditions into incremental updates of necessary auxiliary values as messages are sent and received. The core of the optimizations is the first general method for efficient implementation of logic quantifications. We have developed an operational semantics of the language, implemented a prototype of the compiler and the optimizations, and successfully used the language and implementation on a variety of important distributed algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Leino:2012:PEJ, author = "K. Rustan M. 
Leino and Aleksandar Milicevic", title = "Program extrapolation with {Jennisys}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "411--430", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384646", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The desired behavior of a program can be described using an abstract model. Compiling such a model into executable code requires advanced compilation techniques known as synthesis. This paper presents an object-based language, called Jennisys, where programming is done by introducing an abstract model, defining a concrete data representation for the model, and then being aided by automatic synthesis to produce executable code. The paper also presents a synthesis technique for the language. The technique is built on an automatic program verifier that, via an underlying SMT solver, is capable of providing concrete models to failed verifications. The technique proceeds by obtaining sample input/output values from concrete models and then extrapolating programs from the sample points. The synthesis aims to produce code with assignments, branching structure, and possibly recursive calls. It is the first to synthesize code that creates and uses objects in dynamic data structures or aggregate objects. A prototype of the language and synthesis technique has been implemented.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Kling:2012:BDI, author = "Michael Kling and Sasa Misailovic and Michael Carbin and Martin Rinard", title = "{Bolt}: on-demand infinite loop escape in unmodified binaries", journal = j-SIGPLAN, volume = "47", number = "10", pages = "431--450", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384648", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Bolt, a novel system for escaping from infinite and long-running loops. Directed by a user, Bolt can attach to a running process and determine if the program is executing an infinite loop. If so, Bolt can deploy multiple strategies to escape the loop, restore the responsiveness of the program, and enable the program to deliver useful output. Bolt operates on stripped x86 and x64 binaries, dynamically attaches and detaches to and from the program as needed, and dynamically detects loops and creates program state checkpoints to enable exploration of different escape strategies. 
Bolt can detect and escape from loops in off-the-shelf software, without available source code, and with no overhead in standard production use.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Huang:2012:LSC, author = "Jeff Huang and Charles Zhang", title = "{LEAN}: simplifying concurrency bug reproduction via replay-supported execution reduction", journal = j-SIGPLAN, volume = "47", number = "10", pages = "451--466", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384649", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Debugging concurrent programs is known to be difficult due to scheduling non-determinism. The technique of multiprocessor deterministic replay substantially assists debugging by making the program execution reproducible. However, facing the huge replay traces and long replay time, the debugging task remains stunningly challenging for long running executions. We present a new technique, LEAN, on top of replay, that significantly reduces the complexity of the replay trace and the length of the replay time without losing the determinism in reproducing concurrency bugs. The cornerstone of our work is a redundancy criterion that characterizes the redundant computation in a buggy trace. Based on the redundancy criterion, we have developed two novel techniques to automatically identify and remove redundant threads and instructions in the bug reproduction execution. Our evaluation results with several real world concurrency bugs in large complex server programs demonstrate that LEAN is able to reduce the size, the number of threads, and the number of thread context switches of the replay trace by orders of magnitude, and accordingly greatly shorten the replay time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Effinger-Dean:2012:IIF, author = "Laura Effinger-Dean and Brandon Lucia and Luis Ceze and Dan Grossman and Hans-J. Boehm", title = "{IFRit}: interference-free regions for dynamic data-race detection", journal = j-SIGPLAN, volume = "47", number = "10", pages = "467--484", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384650", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a new algorithm for dynamic data-race detection. Our algorithm reports no false positives and runs on arbitrary C and C++ code. Unlike previous algorithms, we do not have to instrument every memory access or track a full happens-before relation. Our data-race detector, which we call IFRit, is based on a run-time abstraction called an interference-free region (IFR). An IFR is an interval of one thread's execution during which any write to a specific variable by a different thread is a data race. We insert instrumentation at compile time to monitor active IFRs at run-time. If the runtime observes overlapping IFRs for conflicting accesses to the same variable in two different threads, it reports a race. 
The static analysis aggregates information for multiple accesses to the same variable, avoiding the expense of having to instrument every memory access in the program. We directly compare IFRit to FastTrack and ThreadSanitizer, two state-of-the-art fully-precise data-race detectors. We show that IFRit imposes a fraction of the overhead of these detectors. We show that for the PARSEC benchmarks, and several real-world applications, IFRit finds many of the races detected by a fully-precise detector. We also demonstrate that sampling can further reduce IFRit's performance overhead without completely forfeiting precision.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Yu:2012:MCD, author = "Jie Yu and Satish Narayanasamy and Cristiano Pereira and Gilles Pokam", title = "{Maple}: a coverage-driven testing tool for multithreaded programs", journal = j-SIGPLAN, volume = "47", number = "10", pages = "485--502", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384651", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Testing multithreaded programs is a hard problem, because it is challenging to expose those rare interleavings that can trigger a concurrency bug. We propose a new thread interleaving coverage-driven testing tool called Maple that seeks to expose untested thread interleavings as much as possible. It memoizes tested interleavings and actively seeks to expose untested interleavings for a given test input to increase interleaving coverage. We discuss several solutions to realize the above goal. First, we discuss a coverage metric based on a set of interleaving idioms. Second, we discuss an online technique to predict untested interleavings that can potentially be exposed for a given test input. Finally, the predicted untested interleavings are exposed by actively controlling the thread schedule while executing for the test input. We discuss our experiences in using the tool to expose several known and unknown bugs in real-world applications such as Apache and MySQL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Dubrau:2012:TM, author = "Anton Willy Dubrau and Laurie Jane Hendren", title = "Taming {MATLAB}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "503--522", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384653", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "MATLAB is a dynamic scientific language used by scientists, engineers and students worldwide. Although MATLAB is very suitable for rapid prototyping and development, MATLAB users often want to convert their final MATLAB programs to a static language such as FORTRAN. This paper presents an extensible object-oriented toolkit for supporting the generation of static programs from dynamic MATLAB programs. 
Our open source toolkit, called the MATLAB Tamer, identifies a large tame subset of MATLAB, supports the generation of a specialized Tame IR for that subset, provides a principled approach to handling the large number of builtin MATLAB functions, and supports an extensible interprocedural value analysis for estimating MATLAB types and call graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Siddiqui:2012:SSE, author = "Junaid Haroon Siddiqui and Sarfraz Khurshid", title = "Scaling symbolic execution using ranged analysis", journal = j-SIGPLAN, volume = "47", number = "10", pages = "523--536", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384654", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces a novel approach to scale symbolic execution --- a program analysis technique for systematic exploration of bounded execution paths---for test input generation. While the foundations of symbolic execution were developed over three decades ago, recent years have seen a real resurgence of the technique, specifically for systematic bug finding. However, scaling symbolic execution remains a primary technical challenge due to the inherent complexity of the path-based exploration that lies at core of the technique. Our key insight is that the state of the analysis can be represented highly compactly: a test input is all that is needed to effectively encode the state of a symbolic execution run. We present ranged symbolic execution, which embodies this insight and uses two test inputs to define a range, i.e., the beginning and end, for a symbolic execution run. As an application of our approach, we show how it enables scalability by distributing the path exploration---both in a sequential setting with a single worker node and in a parallel setting with multiple workers. As an enabling technology, we leverage the open-source, state-of-the-art symbolic execution tool KLEE. Experimental results using 71 programs chosen from the widely deployed GNU Coreutils set of Unix utilities show that our approach provides a significant speedup over KLEE. For example, using 10 worker cores, we achieve an average speed-up of 6.6X for the 71 programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Tobin-Hochstadt:2012:HOS, author = "Sam Tobin-Hochstadt and David {Van Horn}", title = "Higher-order symbolic execution via contracts", journal = j-SIGPLAN, volume = "47", number = "10", pages = "537--554", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384655", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new approach to automated reasoning about higher-order programs by extending symbolic execution to use behavioral contracts as symbolic values, thus enabling symbolic approximation of higher-order behavior.
Our approach is based on the idea of an abstract reduction semantics that gives an operational semantics to programs with both concrete and symbolic components. Symbolic components are approximated by their contract and our semantics gives an operational interpretation of contracts-as-values. The result is an executable semantics that soundly predicts program behavior, including contract failures, for all possible instantiations of symbolic components. We show that our approach scales to an expressive language of contracts including arbitrary programs embedded as predicates, dependent function contracts, and recursive contracts. Supporting this rich language of specifications leads to powerful symbolic reasoning using existing program constructs. We then apply our approach to produce a verifier for contract correctness of components, including a sound and computable approximation to our semantics that facilitates fully automated contract verification. Our implementation is capable of verifying contracts expressed in existing programs, and of justifying contract-elimination optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Rosu:2012:CRU, author = "Grigore Rosu and Andrei Stefanescu", title = "Checking reachability using matching logic", journal = j-SIGPLAN, volume = "47", number = "10", pages = "555--574", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384656", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a verification framework that is parametric in a (trusted) operational semantics of some programming language. The underlying proof system is language-independent and consists of eight proof rules. The proof system is proved partially correct and relatively complete (with respect to the programming language configuration model). To show its practicality, the generic framework is instantiated with a fragment of C and evaluated with encouraging results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Zhao:2012:HCP, author = "Haiping Zhao and Iain Proctor and Minghui Yang and Xin Qi and Mark Williams and Qi Gao and Guilherme Ottoni and Andrew Paroski and Scott MacVicar and Jason Evans and Stephen Tu", title = "The {HipHop} compiler for {PHP}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "575--586", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384658", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scripting languages are widely used to quickly accomplish a variety of tasks because of the high productivity they enable. Among other reasons, this increased productivity results from a combination of extensive libraries, fast development cycle, dynamic typing, and polymorphism. The dynamic features of scripting languages are traditionally associated with interpreters, which is the approach used to implement most scripting languages. 
Although easy to implement, interpreters are generally slow, which makes scripting languages prohibitive for implementing large, CPU-intensive applications. This efficiency problem is particularly important for PHP given that it is the most commonly used language for server-side web development. This paper presents the design, implementation, and an evaluation of the HipHop compiler for PHP. HipHop goes against the standard practice and implements a very dynamic language through static compilation. After describing the most challenging PHP features to support through static compilation, this paper presents HipHop's design and techniques that support almost all PHP features. We then present a thorough evaluation of HipHop running both standard benchmarks and the Facebook web site. Overall, our experiments demonstrate that HipHop is about 5.5x faster than standard, interpreted PHP engines. As a result, HipHop has reduced the number of servers needed to run Facebook and other web sites by a factor between 4 and 6, thus drastically cutting operating costs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Chugh:2012:DTJ, author = "Ravi Chugh and David Herman and Ranjit Jhala", title = "Dependent types for {JavaScript}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "587--606", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384659", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Dependent JavaScript (DJS), a statically typed dialect of the imperative, object-oriented, dynamic language. DJS supports the particularly challenging features such as run-time type-tests, higher-order functions, extensible objects, prototype inheritance, and arrays through a combination of nested refinement types, strong updates to the heap, and heap unrolling to precisely track prototype hierarchies. With our implementation of DJS, we demonstrate that the type system is expressive enough to reason about a variety of tricky idioms found in small examples drawn from several sources, including the popular book JavaScript: The Good Parts and the SunSpider benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Meawad:2012:EBS, author = "Fadi Meawad and Gregor Richards and Flor{\'e}al Morandat and Jan Vitek", title = "{Eval} begone!: semi-automated removal of {\tt eval} from {JavaScript} programs", journal = j-SIGPLAN, volume = "47", number = "10", pages = "607--620", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384660", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Eval endows JavaScript developers with great power. It allows developers and end-users, by turning text into executable code, to seamlessly extend and customize the behavior of deployed applications as they are running. With great power comes great responsibility, though not in our experience. 
In previous work we demonstrated through a large corpus study that programmers wield that power in rather irresponsible and arbitrary ways. We showed that most calls to eval fall into a small number of very predictable patterns. We argued that those patterns could easily be recognized by an automated algorithm and that they could almost always be replaced with safer JavaScript idioms. In this paper we set out to validate our claim by designing and implementing a tool, which we call Evalorizer, that can assist programmers in getting rid of their unneeded evals. We use the tool to remove eval from a real-world website and validated our approach over logs taken from the top 100 websites with a success rate over 97\% under an open world assumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Kang:2012:FSJ, author = "Seonghoon Kang and Sukyoung Ryu", title = "Formal specification of a {JavaScript} module system", journal = j-SIGPLAN, volume = "47", number = "10", pages = "621--638", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384661", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The JavaScript programming language, originally developed as a simple scripting language, is now the language of choice for web applications. All the top 100 sites on the web use JavaScript and its use outside web pages is rapidly growing. However, JavaScript is not yet ready for programming in the large: it does not support a module system. Lack of namespaces introduces module patterns, and makes it difficult to use multiple JavaScript frameworks together. In this paper, we propose a formal specification of a JavaScript module system. A module system for JavaScript will allow safe and incremental development of JavaScript web applications. While the next version of the JavaScript standard proposes a module system, it informally describes its design in prose. We formally specify a module system as an extension to the existing JavaScript language, and rigorously describe its semantics via desugaring to LambdaJS, a prior core calculus for JavaScript. We implement the desugaring process and show its faithfulness using real-world test suites. Finally, we define a set of properties for valid JavaScript programs using modules and formally prove that the proposed module system satisfies the validity properties.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Barowy:2012:API, author = "Daniel W. Barowy and Charlie Curtsinger and Emery D. Berger and Andrew McGregor", title = "{AutoMan}: a platform for integrating human-based and digital computation", journal = j-SIGPLAN, volume = "47", number = "10", pages = "639--654", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384663", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Humans can perform many tasks with ease that remain difficult or impossible for computers. 
Crowdsourcing platforms like Amazon's Mechanical Turk make it possible to harness human-based computational power at an unprecedented scale. However, their utility as a general-purpose computational platform remains limited. The lack of complete automation makes it difficult to orchestrate complex or interrelated tasks. Scheduling more human workers to reduce latency costs real money, and jobs must be monitored and rescheduled when workers fail to complete their tasks. Furthermore, it is often difficult to predict the length of time and payment that should be budgeted for a given task. Finally, the results of human-based computations are not necessarily reliable, both because human skills and accuracy vary widely, and because workers have a financial incentive to minimize their effort. This paper introduces AutoMan, the first fully automatic crowdprogramming system. AutoMan integrates human-based computations into a standard programming language as ordinary function calls, which can be intermixed freely with traditional functions. This abstraction lets AutoMan programmers focus on their programming logic. An AutoMan program specifies a confidence level for the overall computation and a budget. The AutoMan runtime system then transparently manages all details necessary for scheduling, pricing, and quality control. AutoMan automatically schedules human tasks for each computation until it achieves the desired confidence level; monitors, reprices, and restarts human tasks as necessary; and maximizes parallelism across human workers while staying under budget.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Datta:2012:TVW, author = "Subhajit Datta and Renuka Sindhgatta and Bikram Sengupta", title = "Talk versus work: characteristics of developer collaboration on the {Jazz} platform", journal = j-SIGPLAN, volume = "47", number = "10", pages = "655--668", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384664", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "IBM's Jazz initiative offers a state-of-the-art collaborative development environment (CDE) facilitating developer interactions around interdependent units of work. In this paper, we analyze development data across two versions of a major IBM product developed on the Jazz platform, covering in total 19 months of development activity, including 17,000+ work items and 61,000+ comments made by more than 190 developers in 35 locations. By examining the relation between developer talk and work, we find evidence that developers maintain a reasonably high level of connectivity with peer developers with whom they share work dependencies, but the span of a developer's communication goes much beyond the known dependencies of his/her work items. Using multiple linear regression models, we find that the number of defects owned by a developer is impacted by the number of other developers (s)he is connected through talk, his/her interpersonal influence in the network of work dependencies, the number of work items (s)he comments on, and the number work items (s)he owns. These effects are maintained even after controlling for workload, role, work dependency, and connection related factors. 
We discuss the implications of our results for collaborative software development and project governance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Muulu:2012:SAI, author = "Kivan{\c{c}} Mu{\c{s}}lu and Yuriy Brun and Reid Holmes and Michael D. Ernst and David Notkin", title = "Speculative analysis of integrated development environment recommendations", journal = j-SIGPLAN, volume = "47", number = "10", pages = "669--682", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384665", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern integrated development environments make recommendations and automate common tasks, such as refactorings, auto-completions, and error corrections. However, these tools present little or no information about the consequences of the recommended changes. For example, a rename refactoring may: modify the source code without changing program semantics; modify the source code and (incorrectly) change program semantics; modify the source code and (incorrectly) create compilation errors; show a name collision warning and require developer input; or show an error and not change the source code. Having to compute the consequences of a recommendation --- either mentally or by making source code changes --- puts an extra burden on the developers. This paper aims to reduce this burden with a technique that informs developers of the consequences of code transformations. Using Eclipse Quick Fix as a domain, we describe a plug-in, Quick Fix Scout, that computes the consequences of Quick Fix recommendations. In our experiments, developers completed compilation-error removal tasks 10\% faster when using Quick Fix Scout than Quick Fix, although the sample size was not large enough to show statistical significance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Mayer:2012:ESI, author = "Clemens Mayer and Stefan Hanenberg and Romain Robbes and {\'E}ric Tanter and Andreas Stefik", title = "An empirical study of the influence of static type systems on the usability of undocumented software", journal = j-SIGPLAN, volume = "47", number = "10", pages = "683--702", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384666", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Although the study of static and dynamic type systems plays a major role in research, relatively little is known about the impact of type systems on software development. Perhaps one of the more common arguments for static type systems in languages such as Java or C++ is that they require developers to annotate their code with type names, which is thus claimed to improve the documentation of software. In contrast, one common argument against static type systems is that they decrease flexibility, which may make them harder to use. While these arguments are found in the literature, rigorous empirical evidence is lacking.
We report on a controlled experiment where 27 subjects performed programming tasks on an undocumented API with a static type system (requiring type annotations) as well as a dynamic type system (which does not). Our results show that for some tasks, programmers had faster completion times using a static type system, while for others, the opposite held. We conduct an exploratory study to try and theorize why.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Tseng:2012:SDT, author = "Hung-Wei Tseng and Dean Michael Tullsen", title = "Software data-triggered threads", journal = j-SIGPLAN, volume = "47", number = "10", pages = "703--716", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384668", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The data-triggered threads (DTT) programming and execution model can increase parallelism and eliminate redundant computation. However, the initial proposal requires significant architecture support, which impedes existing applications and architectures from taking advantage of this model. This work proposes a pure software solution that supports the DTT model without any hardware support. This research uses a prototype compiler and runtime libraries running on top of existing machines. Several enhancements to the initial software implementation are presented, which further improve the performance. The software runtime system improves the performance of serial C SPEC benchmarks by 15\% on a Nehalem processor, but by over 7X over the full suite of single-thread applications. It is shown that the DTT model can work in conjunction with traditional parallelism. The DTT model provides up to 64X speedup over parallel applications exploiting traditional parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Anderson:2012:ECP, author = "Zachary Anderson", title = "Efficiently combining parallel software using fine-grained, language-level, hierarchical resource management policies", journal = j-SIGPLAN, volume = "47", number = "10", pages = "717--736", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384669", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents Poli-C, a language extension, runtime library, and system daemon enabling fine-grained, language-level, hierarchical resource management policies. Poli-C is suitable for use in applications that compose parallel libraries, frameworks, and programs. In particular, we have added a powerful new statement to C for expressing resource limits and guarantees in such a way that programmers can set resource management policies even when the source code of parallel libraries and frameworks is not available. Poli-C enables application programmers to manage any resource exposed by the underlying OS, for example cores or IO bandwidth. 
Additionally, we have developed a domain-specific language for defining high-level resource management policies, and a facility for extending the kinds of resources that can be managed with our language extension. Finally, through a number of useful variations, our design offers a high degree of composability. We evaluate Poli-C by way of three case-studies: a scientific application, an image processing webserver, and a pair of parallel database join implementations. We found that using Poli-C yields efficiency gains that require the addition of only a few lines of code to applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Huang:2012:EPS, author = "Jeff Huang and Charles Zhang", title = "Execution privatization for scheduler-oblivious concurrent programs", journal = j-SIGPLAN, volume = "47", number = "10", pages = "737--752", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384670", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Making multithreaded execution less non-deterministic is a promising solution to address the difficulty of concurrent programming plagued by the non-deterministic thread scheduling. In fact, a vast category of concurrent programs are scheduler-oblivious: their execution is deterministic, regardless of the scheduling behavior. We present and formally prove a fundamental observation of the privatizability property for scheduler-oblivious programs, that paves the theoretical foundation for privatizing shared data accesses on a path segment. With privatization, the non-deterministic thread interleavings on the privatized accesses are isolated and as the consequence many concurrency problems are alleviated. We further present a path and context sensitive privatization algorithm that safely privatizes the program without introducing any additional program behavior. Our evaluation results show that the privatization opportunity pervasively exists in real world large complex concurrent systems. Through privatization, several real concurrency bugs are fixed and notable performance improvements are also achieved on benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Imam:2012:ITP, author = "Shams M. Imam and Vivek Sarkar", title = "Integrating task parallelism with actors", journal = j-SIGPLAN, volume = "47", number = "10", pages = "753--772", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384671", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces a unified concurrent programming model combining the previously developed Actor Model (AM) and the task-parallel Async-Finish Model (AFM). With the advent of multi-core computers, there is a renewed interest in programming models that can support a wide range of parallel programming patterns. 
The proposed unified model shows how the divide-and-conquer approach of the AFM and the no-shared mutable state and event-driven philosophy of the AM can be combined to solve certain classes of problems more efficiently and productively than either of the aforementioned models individually. The unified model adds actor creation and coordination to the AFM, while also enabling parallelization within actors. This paper describes two implementations of the unified model as extensions of Habanero-Java and Habanero-Scala. The unified model adds to the foundations of parallel programs, and to the tools available for the programmer to aid in productivity and performance while developing parallel software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Kastner:2012:VAM, author = "Christian K{\"a}stner and Klaus Ostermann and Sebastian Erdweg", title = "A variability-aware module system", journal = j-SIGPLAN, volume = "47", number = "10", pages = "773--792", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384673", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Module systems enable a divide and conquer strategy to software development. To implement compile-time variability in software product lines, modules can be composed in different combinations. However, this way, variability dictates a dominant decomposition. As an alternative, we introduce a variability-aware module system that supports compile-time variability inside a module and its interface. So, each module can be considered a product line that can be type checked in isolation. Variability can crosscut multiple modules. The module system breaks with the antimodular tradition of a global variability model in product-line development and provides a path toward software ecosystems and product lines of product lines developed in an open fashion. We discuss the design and implementation of such a module system on a core calculus and provide an implementation for C as part of the TypeChef project. Our implementation supports variability inside modules from {\tt \#ifdef} preprocessor directives and variable linking at the composition level. With our implementation, we type check all configurations of all modules of the open source product line Busybox with 811 compile-time options, perform linker check of all configurations, and report found type and linker errors --- without resorting to a brute-force strategy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Takikawa:2012:GTF, author = "Asumu Takikawa and T. 
Stephen Strickland and Christos Dimoulas and Sam Tobin-Hochstadt and Matthias Felleisen", title = "Gradual typing for first-class classes", journal = j-SIGPLAN, volume = "47", number = "10", pages = "793--810", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384674", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic type-checking and object-oriented programming often go hand-in-hand; scripting languages such as Python, Ruby, and JavaScript all embrace object-oriented (OO) programming. When scripts written in such languages grow and evolve into large programs, the lack of a static type discipline reduces maintainability. A programmer may thus wish to migrate parts of such scripts to a sister language with a static type system. Unfortunately, existing type systems neither support the flexible OO composition mechanisms found in scripting languages nor accommodate sound interoperation with untyped code. In this paper, we present the design of a gradual typing system that supports sound interaction between statically- and dynamically-typed units of class-based code. The type system uses row polymorphism for classes and thus supports mixin-based OO composition. To protect migration of mixins from typed to untyped components, the system employs a novel form of contracts that partially seal classes. The design comes with a theorem that guarantees the soundness of the type system even in the presence of untyped components.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Tardieu:2012:CK, author = "Olivier Tardieu and Nathaniel Nystrom and Igor Peshansky and Vijay Saraswat", title = "Constrained kinds", journal = j-SIGPLAN, volume = "47", number = "10", pages = "811--830", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384675", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern object-oriented languages such as X10 require a rich framework for types capable of expressing both value-dependency and genericity, and supporting pluggable, domain-specific extensions. In earlier work, we presented a framework for constrained types in object-oriented languages, parametrized by an underlying constraint system. Types are viewed as formulas {Cc} where C is the name of a class or an interface and c is a constraint on the immutable instance state (the properties) of C. Constraint systems are a very expressive framework for partial information. Many (value-)dependent type systems for object-oriented languages can be viewed as constrained types. This paper extends the constrained types approach to handle type-dependency (``genericity''). The key idea is to introduce constrained kinds: in the same way that constraints on values can be used to define constrained types, constraints on types can define constrained kinds. We develop a core programming language with constrained kinds. 
Generic types are supported by introducing type variables---literally, variables with ``type'' Type---and permitting programs to impose subtyping and equality constraints on such variables. We formalize the type-checking rules and establish soundness. While the language now intertwines constraints on types and values, its type system remains parametric in the choice of the value constraint system (language and solver). We demonstrate that constrained kinds are expressive and practical and sketch possible extensions with a discussion of the design and implementation of X10.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Cohen:2012:ET, author = "Michael Cohen and Haitao Steve Zhu and Emgin Ezgi Senem and Yu David Liu", title = "Energy types", journal = j-SIGPLAN, volume = "47", number = "10", pages = "831--850", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384676", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a novel type system to promote and facilitate energy-aware programming. Energy Types is built upon a key insight into today's energy-efficient systems and applications: despite the popular perception that energy and power can only be described in joules and watts, real-world energy management is often based on discrete phases and modes, which in turn can be reasoned about by type systems very effectively. A phase characterizes a distinct pattern of program workload, and a mode represents an energy state the program is expected to execute in. This paper describes a programming model where phases and modes can be intuitively specified by programmers or inferred by the compiler as type information. It demonstrates how a type-based approach to reasoning about phases and modes can help promote energy efficiency. The soundness of our type system and the invariants related to inter-phase and inter-mode interactions are rigorously proved. Energy Types is implemented as the core of a prototyped object-oriented language ET for smartphone programming. Preliminary studies show ET can lead to significant energy savings for Android Apps.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Wu:2012:EIS, author = "Bo Wu and Zhijia Zhao and Xipeng Shen and Yunlian Jiang and Yaoqing Gao and Raul Silvera", title = "Exploiting inter-sequence correlations for program behavior prediction", journal = j-SIGPLAN, volume = "47", number = "10", pages = "851--866", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384678", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Prediction of program dynamic behaviors is fundamental to program optimizations, resource management, and architecture reconfigurations. Most existing predictors are based on locality of program behaviors, subject to some inherent limitations. 
In this paper, we revisit the design philosophy and systematically explore a second source of clues: statistical correlations between the behavior sequences of different program entities. Concentrated on loops, it examines the correlations' existence, strength, and values in enhancing the design of program behavior predictors. It creates the first taxonomy of program behavior sequence patterns. It develops a new form of predictors, named sequence predictors, to effectively translate the correlations into large-scope, proactive predictions of program behavior sequences. It demonstrates the usefulness of the prediction in dynamic version selection and loop importance estimation, showing 19\% average speedup on a number of real-world utility applications. By taking scope and timing of behavior prediction as the first-order design objectives, the new approach overcomes limitations of existing program behavior predictors, opening up many new opportunities for runtime optimizations at various layers of computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Ausiello:2012:KCC, author = "Giorgio Ausiello and Camil Demetrescu and Irene Finocchi and Donatella Firmani", title = "$k$-Calling context profiling", journal = j-SIGPLAN, volume = "47", number = "10", pages = "867--878", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384679", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Calling context trees are one of the most fundamental data structures for representing the interprocedural control flow of a program, providing valuable information for program understanding and optimization. Nodes of a calling context tree associate performance metrics to whole distinct paths in the call graph starting from the root function. However, no explicit information is provided for detecting short hot sequences of activations, which may be a better optimization target in large modular programs where groups of related functions are reused in many different parts of the code. Furthermore, calling context trees can grow prohibitively large in some scenarios. Another classical approach, called edge profiling, collects performance metrics for caller-callee pairs in the call graph, allowing it to detect hot paths of fixed length one. We study a generalization of edge and context-sensitive profiles by introducing a novel data structure called k-calling context forest (k-CCF). Nodes in a k-CCF associate performance metrics to paths of length at most k that lead to each distinct routine of the program, providing edge profiles for k=1, full context-sensitive profiles for k equal to infinity, as well as any other intermediate point in the spectrum. We study the properties of the k-CCF both theoretically and experimentally on a large suite of prominent Linux applications, showing how to construct it efficiently and discussing its relationships with the calling context tree. 
Our experiments show that the k-CCF can provide effective space-accuracy tradeoffs for interprocedural contextual profiling, yielding useful clues to the hot spots of a program that may be hidden in a calling context tree and using less space for small values of k, which appear to be the most interesting in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Huang:2012:RRC, author = "Wei Huang and Ana Milanova and Werner Dietl and Michael D. Ernst", title = "{Reim \& ReImInfer}: checking and inference of reference immutability and method purity", journal = j-SIGPLAN, volume = "47", number = "10", pages = "879--896", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384680", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reference immutability ensures that a reference is not used to modify the referenced object, and enables the safe sharing of object structures. A pure method does not cause side-effects on the objects that existed in the pre-state of the method execution. Checking and inference of reference immutability and method purity enables a variety of program analyses and optimizations. We present ReIm, a type system for reference immutability, and ReImInfer, a corresponding type inference analysis. The type system is concise and context-sensitive. The type inference analysis is precise and scalable, and requires no manual annotations. In addition, we present a novel application of the reference immutability type system: method purity inference. To support our theoretical results, we implemented the type system and the type inference analysis for Java. We include a type checker to verify the correctness of the inference result. Empirical results on Java applications and libraries of up to 348kLOC show that our approach achieves both scalability and precision.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Bao:2012:WBS, author = "Tao Bao and Yunhui Zheng and Xiangyu Zhang", title = "White box sampling in uncertain data processing enabled by program analysis", journal = j-SIGPLAN, volume = "47", number = "10", pages = "897--914", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384681", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Sampling is a very important and low-cost approach to uncertain data processing, in which output variations caused by input errors are sampled. Traditional methods tend to treat a program as a blackbox. In this paper, we show that through program analysis, we can expose the internals of sample executions so that the process can become more selective and focused. In particular, we develop a sampling runtime that can selectively sample in input error bounds to expose discontinuity in output functions. It identifies all the program factors that can potentially lead to discontinuity and hash the values of such factors during execution in a cost-effective way. 
The hash values are used to guide the sampling process. Our results show that the technique is very effective for real-world programs. It can achieve the precision of a high sampling rate with the cost of a lower sampling rate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Lucas:2012:DPM, author = "Charles Lucas and Sebastian Elbaum and David S. Rosenblum", title = "Detecting problematic message sequences and frequencies in distributed systems", journal = j-SIGPLAN, volume = "47", number = "10", pages = "915--926", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384683", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Testing the components of a distributed system is challenging as it requires consideration of not just the state of a component, but also the sequence of messages it may receive from the rest of the system or the environment. Such messages may vary in type and content, and more particularly, in the frequency at which they are generated. All of these factors, in the right combination, may lead to faulty behavior. In this paper we present an approach to address these challenges by systematically analyzing a component in a distributed system to identify specific message sequences and frequencies at which a failure can occur. At the core of the analysis is the generation of a test driver that defines the space of message sequences to be generated, the exploration of that space through the use of dynamic symbolic execution, and the timing and analysis of the generated tests to identify problematic frequencies. We implemented our approach in the context of the popular Robotic Operating System and investigated its application to three systems of increasing complexity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Gu:2012:RDK, author = "Zhongxian Gu and Earl T. Barr and Drew Schleck and Zhendong Su", title = "Reusing debugging knowledge via trace-based bug search", journal = j-SIGPLAN, volume = "47", number = "10", pages = "927--942", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384684", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Some bugs, among the millions that exist, are similar to each other. One bug-fixing tactic is to search for similar bugs that have been reported and resolved in the past. A fix for a similar bug can help a developer understand a bug, or even directly fix it. Studying bugs with similar symptoms, programmers may determine how to detect or resolve them. To speed debugging, we advocate the systematic capture and reuse of debugging knowledge, much of which is currently wasted. The core challenge here is how to search for similar bugs. To tackle this problem, we exploit semantic bug information in the form of execution traces, which precisely capture bug semantics. 
This paper introduces novel tool and language support for semantically querying and analyzing bugs. We describe OSCILLOSCOPE, an Eclipse plugin, that uses a bug trace to exhaustively search its database for similar bugs and return their bug reports. OSCILLOSCOPE displays the traces of the bugs it returns against the trace of the target bug, so a developer can visually examine the quality of the matches. OSCILLOSCOPE rests on our bug query language (BQL), a flexible query language over traces. To realize OSCILLOSCOPE, we developed an open infrastructure that consists of a trace collection engine, BQL, a Hadoop-based query engine for BQL, a trace-indexed bug database, as well as a web-based frontend. OSCILLOSCOPE records and uploads bug traces to its infrastructure; it does so automatically when a JUnit test fails. We evaluated OSCILLOSCOPE on bugs collected from popular open-source projects. We show that OSCILLOSCOPE accurately and efficiently finds similar bugs, some of which could have been immediately used to fix open bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Strickland:2012:CIR, author = "T. Stephen Strickland and Sam Tobin-Hochstadt and Robert Bruce Findler and Matthew Flatt", title = "Chaperones and impersonators: run-time support for reasonable interposition", journal = j-SIGPLAN, volume = "47", number = "10", pages = "943--962", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384685", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Chaperones and impersonators provide run-time support for interposing on primitive operations such as function calls, array access and update, and structure field access and update. Unlike most interposition support, chaperones and impersonators are restricted so that they constrain the behavior of the interposing code to reasonable interposition, which in practice preserves the abstraction mechanisms and reasoning that programmers and compiler analyses rely on. Chaperones and impersonators are particularly useful for implementing contracts, and our implementation in Racket allows us to improve both the expressiveness and the performance of Racket's contract system. Specifically, contracts on mutable data can be enforced without changing the API to that data; contracts on large data structures can be checked lazily on only the accessed parts of the structure; contracts on objects and classes can be implemented with lower overhead; and contract wrappers can preserve object equality where appropriate. 
With this extension, gradual typing systems, such as Typed Racket, that rely on contracts for interoperation with untyped code can now pass mutable values safely between typed and untyped modules.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Solodkyy:2012:OET, author = "Yuriy Solodkyy and Gabriel {Dos Reis} and Bjarne Stroustrup", title = "Open and efficient type switch for {C++}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "963--982", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384686", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Selecting operations based on the run-time type of an object is key to many object-oriented and functional programming techniques. We present a technique for implementing open and efficient type switching on hierarchical extensible data types. The technique is general and copes well with C++ multiple inheritance. To simplify experimentation and gain realistic performance using production-quality compilers and tool chains, we implement a type switch construct as an ISO C++11 library, called Mach7. This library-only implementation provides concise notation and outperforms the visitor design pattern, commonly used for case analysis on types in object-oriented programming. For closed sets of types, its performance roughly equals equivalent code in functional languages, such as OCaml and Haskell. The type-switching code is easier to use and is more expressive than hand-coded visitors are. The library is non-intrusive and circumvents most of the extensibility restrictions typical of the visitor design pattern. It was motivated by applications involving large, typed, abstract syntax trees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Tamayo:2012:UBD, author = "Juan M. Tamayo and Alex Aiken and Nathan Bronson and Mooly Sagiv", title = "Understanding the behavior of database operations under program control", journal = j-SIGPLAN, volume = "47", number = "10", pages = "983--996", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384688", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Applications that combine general program logic with persistent databases (e.g., three-tier applications) often suffer large performance penalties from poor use of the database. We introduce a program analysis technique that combines information flow in the program with commutativity analysis of its database operations to produce a unified dependency graph for database statements, which provides programmers with a high-level view of how costly database operations are and how they are connected in the program. As an example application of our analysis we describe three optimizations that can be discovered by examining the structure of the dependency graph; each helps remove communication latency from the critical path of a multi-tier system. 
We implement our technique in a tool for Java applications using JDBC and experimentally validate it using the multi-tier component of the Dacapo benchmark.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Mishne:2012:TBS, author = "Alon Mishne and Sharon Shoham and Eran Yahav", title = "Typestate-based semantic code search over partial programs", journal = j-SIGPLAN, volume = "47", number = "10", pages = "997--1016", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384689", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a novel code search approach for answering queries focused on API-usage with code showing how the API should be used. To construct a search index, we develop new techniques for statically mining and consolidating temporal API specifications from code snippets. In contrast to existing semantic-based techniques, our approach handles partial programs in the form of code snippets. Handling snippets allows us to consume code from various sources such as parts of open source projects, educational resources (e.g. tutorials), and expert code sites. To handle code snippets, our approach (i) extracts a possibly partial temporal specification from each snippet using a relatively precise static analysis tracking a generalized notion of typestate, and (ii) consolidates the partial temporal specifications, combining consistent partial information to yield consolidated temporal specifications, each of which captures a full(er) usage scenario. To answer a search query, we define a notion of relaxed inclusion matching a query against temporal specifications and their corresponding code snippets. We have implemented our approach in a tool called PRIME and applied it to search for API usage of several challenging APIs. PRIME was able to analyze and consolidate thousands of snippets per tested API, and our results indicate that the combination of a relatively precise analysis and consolidation allowed PRIME to answer challenging queries effectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{Xu:2012:FRD, author = "Guoqing Xu", title = "Finding reusable data structures", journal = j-SIGPLAN, volume = "47", number = "10", pages = "1017--1034", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384690", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A big source of run-time performance problems in large-scale, object-oriented applications is the frequent creation of data structures (by the same allocation site) whose lifetimes are disjoint, and whose shapes and data content are always the same. Constructing these data structures and computing the same data values many times is expensive; significant performance improvements can be achieved by reusing their instances, shapes, and/or data values rather than reconstructing them. 
This paper presents a run-time technique that can be used to help programmers find allocation sites that create such data structures to improve performance. At the heart of the technique are three reusability definitions and novel summarization approaches that compute summaries for data structures based on these definitions. The computed summaries are used subsequently to find data structures that have disjoint lifetimes, and/or that have the same shapes and content. We have implemented this technique in the Jikes RVM and performed extensive studies on large-scale, real-world programs. We describe our experience using six case studies, in which we have achieved large performance gains by fixing problems reported by our tool.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '12 conference proceedings.", } @Article{OCallahan:2012:WYW, author = "Robert O'Callahan", title = "Why is your {Web} browser using so much memory?", journal = j-SIGPLAN, volume = "47", number = "11", pages = "1--2", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2258998", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Browsers are the operating systems of the Web. They support a vast universe of applications written in a modern garbage-collected programming language. Browsers expose a rich platform API mostly implemented in C++. Browsers are also consumer software with low switching costs in an intensely competitive market. Thus in addition to standard requirements such as maximizing throughput and minimizing latency, browsers have to consider issues like: when the user closes a window while watching Task Manager, they want to see memory usage go down. Browsers have to compete to minimize memory usage even for poorly written applications. In this talk I will elucidate these requirements and describe how Firefox and other browsers address them. I will pay particular attention to issues that we don't know how to solve, and that could benefit from research attention.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhou:2012:MMM, author = "Jin Zhou and Brian Demsky", title = "Memory management for many-core processors with software configurable locality policies", journal = j-SIGPLAN, volume = "47", number = "11", pages = "3--14", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259000", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "As processors evolve towards higher core counts, architects will develop more sophisticated memory systems to satisfy the cores' increasing thirst for memory bandwidth. Early many-core processor designs suggest that future memory systems will likely include multiple controllers and distributed cache coherence protocols. Many-core processors that expose memory locality policies to the software system provide opportunities for automatic tuning that can achieve significant performance benefits.
Managed languages typically provide a simple heap abstraction. This paper presents techniques that bridge the gap between the simple heap abstraction of modern languages and the complicated memory systems of future processors. We present a NUMA-aware approach to garbage collection that balances the competing concerns of data locality and heap utilization to improve performance. We combine a lightweight approach for measuring an application's memory behavior with an online, adaptive algorithm for tuning the cache to optimize it for the specific application's behaviors. We have implemented our garbage collector and cache tuning algorithm and present results on a 64-core TILEPro64 processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lyberis:2012:MMA, author = "Spyros Lyberis and Polyvios Pratikakis and Dimitrios S. Nikolopoulos and Martin Schulz and Todd Gamblin and Bronis R. de Supinski", title = "The {Myrmics} memory allocator: hierarchical, message-passing allocation for global address spaces", journal = j-SIGPLAN, volume = "47", number = "11", pages = "15--24", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259001", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Constantly increasing hardware parallelism poses more and more challenges to programmers and language designers. One approach to harness the massive parallelism is to move to task-based programming models that rely on runtime systems for dependency analysis and scheduling. Such models generally benefit from the existence of a global address space. This paper presents the parallel memory allocator of the Myrmics runtime system, in which multiple allocator instances organized in a tree hierarchy cooperate to implement a global address space with dynamic region support on distributed memory machines. The Myrmics hierarchical memory allocator is a step towards improved productivity and performance in parallel programming. Productivity is improved through the use of dynamic regions in a global address space, which provide a convenient shared memory abstraction for dynamic and irregular data structures. Performance is improved through scaling on manycore systems without system-wide cache coherency. We evaluate the stand-alone allocator on an MPI-based x86 cluster and find that it scales well for up to 512 worker cores, while it can outperform Unified Parallel C by a factor of 3.7-10.7x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Maas:2012:GOO, author = "Martin Maas and Philip Reames and Jeffrey Morlan and Krste Asanovi{\'c} and Anthony D. 
Joseph and John Kubiatowicz", title = "{GPUs} as an opportunity for offloading garbage collection", journal = j-SIGPLAN, volume = "47", number = "11", pages = "25--36", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259002", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "GPUs have become part of most commodity systems. Nonetheless, they are often underutilized when not executing graphics-intensive or special-purpose numerical computations, which are rare in consumer workloads. Emerging architectures, such as integrated CPU/GPU combinations, may create an opportunity to utilize these otherwise unused cycles for offloading traditional systems tasks. Garbage collection appears to be a particularly promising candidate for offloading, due to the popularity of managed languages on consumer devices. We investigate the challenges for offloading garbage collection to a GPU, by examining the performance trade-offs for the mark phase of a mark \& sweep garbage collector. We present a theoretical analysis and an algorithm that demonstrates the feasibility of this approach. We also discuss a number of algorithmic design trade-offs required to leverage the strengths and capabilities of the GPU hardware. Our algorithm has been integrated into the Jikes RVM and we present promising performance results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yang:2012:BRF, author = "Xi Yang and Stephen M. Blackburn and Daniel Frampton and Antony L. Hosking", title = "Barriers reconsidered, friendlier still!", journal = j-SIGPLAN, volume = "47", number = "11", pages = "37--48", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259004", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Read and write barriers mediate access to the heap allowing the collector to control and monitor mutator actions. For this reason, barriers are a powerful tool in the design of any heap management algorithm, but the prevailing wisdom is that they impose significant costs. However, changes in hardware and workloads make these costs a moving target. Here, we measure the cost of a range of useful barriers on a range of modern hardware and workloads. We confirm some old results and overturn others. We evaluate the microarchitectural sensitivity of barrier performance and the differences among benchmark suites. We also consider barriers in context, focusing on their behavior when used in combination, and investigate a known pathology and evaluate solutions. Our results show that read and write barriers have average overheads as low as 5.4\% and 0.9\% respectively. We find that barrier overheads are more exposed on the workload provided by the modern DaCapo benchmarks than on old SPECjvm98 benchmarks. Moreover, there are differences in barrier behavior between in-order and out-of-order machines, and their respective memory subsystems, which indicate different barrier choices for different platforms. 
These changing costs mean that algorithm designers need to reconsider their design choices and the nature of their resulting algorithms in order to exploit the opportunities presented by modern hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sivaramakrishnan:2012:ERB, author = "KC Sivaramakrishnan and Lukasz Ziarek and Suresh Jagannathan", title = "Eliminating read barriers through procrastination and cleanliness", journal = j-SIGPLAN, volume = "47", number = "11", pages = "49--60", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259005", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Managed languages typically use read barriers to interpret forwarding pointers introduced to keep track of copied objects. For example, in a multicore environment with thread-local heaps and a global, shared heap, an object initially allocated on a local heap may be copied to a shared heap if it becomes the source of a store operation whose target location resides on the shared heap. As part of the copy operation, a forwarding pointer may be established in the original object to point to the copied object. This level of indirection avoids the need to update all of the references to the object that has been copied. In this paper, we consider the design of a managed runtime that eliminates read barriers. Our design is premised on the availability of a sufficient degree of concurrency to stall operations that would otherwise necessitate the copy. Stalled actions are deferred until the next local collection, avoiding exposing forwarding pointers to the mutator. In certain important cases, procrastination is unnecessary --- lightweight runtime techniques can sometimes be used to allow objects to be eagerly copied when their set of incoming references is known, or when it can be determined that having multiple copies would not violate program semantics. We evaluate our techniques on 3 platforms: a 16-core AMD64 machine, a 48-core Intel SCC, and an 864-core Azul Vega 3. Experimental results over a range of parallel benchmarks indicate that our approach leads to notable performance gains (20 --- 32\% on average) without incurring any additional complexity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Iyengar:2012:SCP, author = "Balaji Iyengar and Edward Gehringer and Michael Wolf and Karthikeyan Manivannan", title = "Scalable concurrent and parallel mark", journal = j-SIGPLAN, volume = "47", number = "11", pages = "61--72", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259006", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Parallel marking algorithms use multiple threads to walk through the object heap graph and mark each reachable object as live. Parallel marker threads mark an object ``live'' by atomically setting a bit in a mark-bitmap or a bit in the object header. 
Most of these parallel algorithms strive to improve the marking throughput by using work-stealing algorithms for load-balancing and to ensure that all participating threads are kept busy. A purely ``processor-centric'' load-balancing approach in conjunction with a need to atomically set the mark bit, results in significant contention during parallel marking. This limits the scalability and throughput of parallel marking algorithms. We describe a new non-blocking and lock-free, work-sharing algorithm, the primary goal being to reduce contention during atomic updates of the mark-bitmap by parallel task-threads. Our work-sharing mechanism uses the address of a word in the mark-bitmap as the key to stripe work among parallel task-threads, with only a subset of the task-threads working on each stripe. This filters out most of the contention during parallel marking with 20\% improvements in performance. In case of concurrent and on-the-fly collector algorithms, mutator threads also generate marking-work for the marking task-threads. In these schemes, mutator threads are also provided with thread-local marking stacks where they collect references to potentially ``gray'' objects, i.e., objects that haven't been ``marked-through'' by the collector. We note that since this work is generated by mutators when they reference these objects, there is a high likelihood that these objects continue to be present in the processor cache. We describe and evaluate a scheme to distribute mutator generated marking work among the collector's task-threads that is cognizant of the processor and cache topology. We prototype both our algorithms within the C4 [28] collector that ships as part of an industrial strength JVM for the Linux-X86 platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shahriyar:2012:CGR, author = "Rifat Shahriyar and Stephen M. Blackburn and Daniel Frampton", title = "Down for the count? {Getting} reference counting back in the ring", journal = j-SIGPLAN, volume = "47", number = "11", pages = "73--84", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259008", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Reference counting and tracing are the two fundamental approaches that have underpinned garbage collection since 1960. However, despite some compelling advantages, reference counting is almost completely ignored in implementations of high performance systems today. In this paper we take a detailed look at reference counting to understand its behavior and to improve its performance. We identify key design choices for reference counting and analyze how the behavior of a wide range of benchmarks might affect design decisions. As far as we are aware, this is the first such quantitative study of reference counting. We use insights gleaned from this analysis to introduce a number of optimizations that significantly improve the performance of reference counting. We find that an existing modern implementation of reference counting has an average 30\% overhead compared to tracing, and that in combination, our optimizations are able to completely eliminate that overhead. 
This brings the performance of reference counting on par with that of a well tuned mark-sweep collector. We keep our in-depth analysis of reference counting as general as possible so that it may be useful to other garbage collector implementers. Our finding that reference counting can be made directly competitive with well tuned mark-sweep should shake the community's prejudices about reference counting and perhaps open new opportunities for exploiting reference counting's strengths, such as localization and immediacy of reclamation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Iyengar:2012:CWF, author = "Balaji Iyengar and Gil Tene and Michael Wolf and Edward Gehringer", title = "The {Collie}: a wait-free compacting collector", journal = j-SIGPLAN, volume = "47", number = "11", pages = "85--96", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259009", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "We describe the Collie collector, a fully concurrent compacting collector that uses transactional memory techniques to achieve wait-free compaction. The collector uses compaction as the primary means of reclaiming unused memory, and performs ``individual object transplantations'' as transactions. We introduce new terms and requirements useful for analyzing concurrent relocating collectors, including definitions of referrer sets, object transplantation and the notion of individually transplantable objects. The Collie collector builds on these terms and on a detailed analysis of an object's legal states during compaction. Collie uses a combination of read barriers, write barriers and transactional memory operations. Its read-barrier supports fast, direct object referencing while using a bound, constant time, wait-free triggering path. Collie thereby avoids the constant indirection cost of Brooks [9] style barriers or handle-based heaps [25]. Collie is demonstrated using speculative multi-address atomicity [11], a form of hardware transactional memory supported by the Azul Vega architecture [2]. We evaluate the Collie collector on the Azul platform, on which previous concurrent collectors such as the Pauseless Collector [12] and its generational variant [30] have been commercially available for several years. We discuss Collie's performance while running sustained workloads, and compare it to the Pauseless collector on the same platform. The Collie collector provides significant MMU [5] improvements even in the 1-msec time windows compared to the Pauseless collector. At the same time, it matches Pauseless in throughput and in the ability to scale to large heap sizes. We believe that the Collie collector is the first garbage collector to leverage hardware-assisted transactional memory. While Collie directly leverages Vega's speculative multi-address atomicity feature (SMA) [11], its design can be easily adapted to other hardware-assisted transactional memory systems. Specifically, the upcoming Intel TSX instruction set extensions [21] include capabilities similar to SMA. 
We expect Collie to be easily implementable on future commodity servers based on Intel Haswell processors and following processor generations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sewe:2012:NSI, author = "Andreas Sewe and Mira Mezini and Aibek Sarimbekov and Danilo Ansaloni and Walter Binder and Nathan Ricci and Samuel Z. Guyer", title = "{{\tt New Scala() instanceof}} {Java}: a comparison of the memory behaviour of {Java} and {Scala} programs", journal = j-SIGPLAN, volume = "47", number = "11", pages = "97--108", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259010", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "While often designed with a single language in mind, managed runtimes like the Java virtual machine (JVM) have become the target of not one but many languages, all of which benefit from the runtime's services. One of these services is automatic memory management. In this paper, we compare and contrast the memory behaviour of programs written in Java and Scala, respectively, two languages which both target the same platform: the JVM. We both analyze core object demographics like object lifetimes as well as secondary properties of objects like their associated monitors and identity hash-codes. We find that objects in Scala programs have lower survival rates and higher rates of immutability, which is only partly explained by the memory behaviour of objects representing closures or boxed primitives. Other metrics vary more by benchmark than language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gu:2012:GTC, author = "Xiaoming Gu and Chen Ding", title = "A generalized theory of collaborative caching", journal = j-SIGPLAN, volume = "47", number = "11", pages = "109--120", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259012", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Collaborative caching allows software to use hints to influence cache management in hardware. Previous theories have shown that such hints observe the inclusion property and can obtain optimal caching if the access sequence and the cache size are known ahead of time. Previously, the interface of a cache hint is limited, e.g., a binary choice between LRU and MRU. In this paper, we generalize the hint interface, where a hint is a number encoding a priority. We show the generality in a hierarchical relation where collaborative caching subsumes non-collaborative caching, and within collaborative caching, the priority hint subsumes the previous binary hint. We show two theoretical results for the general hint. The first is a new cache replacement policy, priority LRU, which permits the complete range of choices between MRU and LRU. We prove a new type of inclusion property---non-uniform inclusion---and give a one-pass algorithm to compute the miss rate for all cache sizes. 
Second, we show that priority hints can enable the use of the same hints to obtain optimal caching for all cache sizes, without having to know the cache size beforehand.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nasre:2012:ESC, author = "Rupesh Nasre", title = "Exploiting the structure of the constraint graph for efficient points-to analysis", journal = j-SIGPLAN, volume = "47", number = "11", pages = "121--132", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259013", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Points-to analysis is a key compiler analysis. Several memory related optimizations use points-to information to improve their effectiveness. Points-to analysis is performed by building a constraint graph of pointer variables and dynamically updating it to propagate more and more points-to information across its subset edges. So far, the structure of the constraint graph has been only trivially exploited for efficient propagation of information, e.g., in identifying cyclic components or to propagate information in topological order. We perform a careful study of its structure and propose a new inclusion-based flow-insensitive context-sensitive points-to analysis algorithm based on the notion of dominant pointers. We also propose a new kind of pointer-equivalence based on dominant pointers which provides significantly more opportunities for reducing the number of pointers tracked during the analysis. Based on this hitherto unexplored form of pointer-equivalence, we develop a new context-sensitive flow insensitive points-to analysis algorithm which uses incremental dominator update to efficiently compute points-to information. Using a large suite of programs consisting of SPEC 2000 benchmarks and five large open source programs we show that our points-to analysis is 88\% faster than BDD-based Lazy Cycle Detection and $ 2 \times $ faster than Deep Propagation. We argue that our approach of detecting dominator-based pointer-equivalence is a key to improve points-to analysis efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Inoue:2012:ISC, author = "Hiroshi Inoue and Toshio Nakatani", title = "Identifying the sources of cache misses in {Java} programs without relying on hardware counters", journal = j-SIGPLAN, volume = "47", number = "11", pages = "133--142", month = nov, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2426642.2259014", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jan 10 08:55:30 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '12 conference proceedings.", abstract = "Cache miss stalls are one of the major sources of performance bottlenecks for multicore processors. A Hardware Performance Monitor (HPM) in the processor is useful for locating the cache misses, but is rarely used in the real world for various reasons. It would be better to find a simple approach to locate the sources of cache misses and apply runtime optimizations without relying on an HPM. 
This paper shows that pointer dereferencing in hot loops is a major source of cache misses in Java programs. Based on this observation, we devised a new approach to identify the instructions and objects that cause frequent cache misses. Our heuristic technique effectively identifies the majority of the cache misses in typical Java programs by matching the hot loops to simple idiomatic code patterns. On average, our technique selected only 2.8\% of the load and store instructions generated by the JIT compiler and these instructions accounted for 47\% of the L1D cache misses and 49\% of the L2 cache misses caused by the JIT-compiled code. To prove the effectiveness of our technique in compiler optimizations, we prototyped object placement optimizations, which align objects in cache lines or collocate paired objects in the same cache line to reduce cache misses. For comparison, we also implemented the same optimizations based on the accurate information obtained from the HPM. Our results showed that our heuristic approach was as effective as the HPM-based approach and achieved comparable performance improvements in the {\tt SPECjbb2005} and {\tt SPECpower\_ssj2008} benchmark programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Farmer:2012:HMP, author = "Andrew Farmer and Andy Gill and Ed Komp and Neil Sculthorpe", title = "The {HERMIT} in the machine: a plugin for the interactive transformation of {GHC} core language programs", journal = j-SIGPLAN, volume = "47", number = "12", pages = "1--12", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364508", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "The importance of reasoning about and refactoring programs is a central tenet of functional programming. Yet our compilers and development toolchains only provide rudimentary support for these tasks. This paper introduces a programmatic and compiler-centric interface that facilitates refactoring and equational reasoning. To develop our ideas, we have implemented HERMIT, a toolkit enabling informal but systematic transformation of Haskell programs from inside the Glasgow Haskell Compiler's optimization pipeline. With HERMIT, users can experiment with optimizations and equational reasoning, while the tedious heavy lifting of performing the actual transformations is done for them. HERMIT provides a transformation API that can be used to build higher-level rewrite tools. One use-case is prototyping new optimizations as clients of this API before being committed to the GHC toolchain. We describe a HERMIT application --- a read-eval-print shell for performing transformations using HERMIT. We also demonstrate using this shell to prototype an optimization on a specific example, and report our initial experiences and remaining challenges.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Adams:2012:TYB, author = "Michael D. Adams and Thomas M. 
DuBuisson", title = "Template your boilerplate: using {Template Haskell} for efficient generic programming", journal = j-SIGPLAN, volume = "47", number = "12", pages = "13--24", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364509", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "Generic programming allows the concise expression of algorithms that would otherwise require large amounts of handwritten code. A number of such systems have been developed over the years, but a common drawback of these systems is poor runtime performance relative to handwritten, non-generic code. Generic-programming systems vary significantly in this regard, but few consistently match the performance of handwritten code. This poses a dilemma for developers. Generic-programming systems offer concision but cost performance. Handwritten code offers performance but costs concision. This paper explores the use of Template Haskell to achieve the best of both worlds. It presents a generic-programming system for Haskell that provides both the concision of other generic-programming systems and the efficiency of handwritten code. Our system gives the programmer a high-level, generic-programming interface, but uses Template Haskell to generate efficient, non-generic code that outperforms existing generic-programming systems for Haskell. This paper presents the results of benchmarking our system against both handwritten code and several other generic-programming systems. In these benchmarks, our system matches the performance of handwritten code while other systems average anywhere from two to twenty times slower.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lippmeier:2012:GPA, author = "Ben Lippmeier and Manuel Chakravarty and Gabriele Keller and Simon Peyton Jones", title = "Guiding parallel array fusion with indexed types", journal = j-SIGPLAN, volume = "47", number = "12", pages = "25--36", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364511", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "We present a refined approach to parallel array fusion that uses indexed types to specify the internal representation of each array. Our approach aids the client programmer in reasoning about the performance of their program in terms of the source code. It also makes the intermediate code easier to transform at compile-time, resulting in faster compilation and more reliable runtimes. We demonstrate how our new approach improves both the clarity and performance of several end-user written programs, including a fluid flow solver and an interpolator for volumetric data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Keller:2012:VA, author = "Gabriele Keller and Manuel M. T. 
Chakravarty and Roman Leshchinskiy and Ben Lippmeier and Simon Peyton Jones", title = "Vectorisation avoidance", journal = j-SIGPLAN, volume = "47", number = "12", pages = "37--48", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364512", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "Flattening nested parallelism is a vectorising code transform that converts irregular nested parallelism into flat data parallelism. Although the result has good asymptotic performance, flattening thoroughly restructures the code. Many intermediate data structures and traversals are introduced, which may or may not be eliminated by subsequent optimisation. We present a novel program analysis to identify parts of the program where flattening would only introduce overhead, without appropriate gain. We present empirical evidence that avoiding vectorisation in these cases leads to more efficient programs than if we had applied vectorisation and then relied on array fusion to eliminate intermediates from the resulting code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jeuring:2012:TTC, author = "Johan Jeuring and Patrik Jansson and Cl{\'a}udio Amaral", title = "Testing type class laws", journal = j-SIGPLAN, volume = "47", number = "12", pages = "49--60", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364514", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "The specification of a class in Haskell often starts with stating, in comments, the laws that should be satisfied by methods defined in instances of the class, followed by the type of the methods of the class. This paper develops a framework that supports testing such class laws using QuickCheck. Our framework is a light-weight class law testing framework, which requires a limited amount of work per class law, and per datatype for which the class law is tested. We also show how to test class laws with partially-defined values. Using partially-defined values, we show that the standard lazy and strict implementations of the state monad do not satisfy the expected laws.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Duregaard:2012:FFE, author = "Jonas Dureg{\aa}rd and Patrik Jansson and Meng Wang", title = "{Feat}: functional enumeration of algebraic types", journal = j-SIGPLAN, volume = "47", number = "12", pages = "61--72", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364515", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "In mathematics, an enumeration of a set S is a bijective function from (an initial segment of) the natural numbers to S. 
We define ``functional enumerations'' as efficiently computable such bijections. This paper describes a theory of functional enumeration and provides an algebra of enumerations closed under sums, products, guarded recursion and bijections. We partition each enumerated set into numbered, finite subsets. We provide a generic enumeration such that the number of each part corresponds to the size of its values (measured in the number of constructors). We implement our ideas in a Haskell library called testing-feat, and make the source code freely available. Feat provides efficient ``random access'' to enumerated values. The primary application is property-based testing, where it is used to define both random sampling (for example QuickCheck generators) and exhaustive enumeration (in the style of SmallCheck). We claim that functional enumeration is the best option for automatically generating test cases from large groups of mutually recursive syntax tree types. As a case study we use Feat to test the pretty-printer of the Template Haskell library (uncovering several bugs).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Claessen:2012:SSF, author = "Koen Claessen", title = "Shrinking and showing functions: (functional pearl)", journal = j-SIGPLAN, volume = "47", number = "12", pages = "73--80", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364516", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "Although quantification over functions in QuickCheck properties has been supported from the beginning, displaying and shrinking them as counter examples has not. The reason is that in general, functions are infinite objects, which means that there is no sensible show function for them, and shrinking an infinite object within a finite number of steps seems impossible. This paper presents a general technique with which functions as counter examples can be shrunk to finite objects, which can then be displayed to the user. The approach turns out to be practically usable, which is shown by a number of examples. The two main limitations are that higher-order functions cannot be dealt with, and it is hard to deal with terms that contain functions as subterms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Allen:2012:SDR, author = "Wyatt Allen and Martin Erwig", title = "{Surveyor}: a {DSEL} for representing and analyzing strongly typed surveys", journal = j-SIGPLAN, volume = "47", number = "12", pages = "81--90", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "Polls and surveys are increasingly employed to gather information about attitudes and experiences of all kinds of populations and user groups. The ultimate purpose of a survey is to identify trends and relationships that can inform decision makers. 
To this end, the data gathered by a survey must be appropriately analyzed. Most of the currently existing tools focus on the user interface aspect of the data collection task, but pay little attention to the structure and type of the collected data, which are usually represented as potentially tag-annotated, but otherwise unstructured, plain text. This makes the task of writing data analysis programs often difficult and error-prone, whereas a typed data representation could support the writing of type-directed data analysis tools that would enjoy the many benefits of static typing. In this paper we present Surveyor, a DSEL that allows the compositional construction of typed surveys, where the types describe the structure of the data to be collected. A survey can be run to gather typed data, which can then be subjected to analysis tools that are built using Surveyor's typed combinators. Altogether the Surveyor DSEL realizes a strongly typed and type-directed approach to data gathering and analysis. The implementation of our DSEL is based on GADTs to allow a flexible, yet strongly typed representation of surveys. Moreover, the implementation employs the Scrap-Your-Boilerplate library to facilitate the type-dependent traversal, extraction, and combination of data gathered from surveys.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Winograd-Cort:2012:WIE, author = "Daniel Winograd-Cort and Paul Hudak", title = "Wormholes: introducing effects to {FRP}", journal = j-SIGPLAN, volume = "47", number = "12", pages = "91--104", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364519", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "Functional reactive programming (FRP) is a useful model for programming real-time and reactive systems in which one defines a signal function to process a stream of input values into a stream of output values. However, performing side effects (e.g. memory mutation or input/output) in this model is tricky and typically unsafe. In previous work, Winograd-Cort et al. [2012] introduced resource types and wormholes to address this problem. This paper better motivates, expands upon, and formalizes the notion of a wormhole to fully unlock its potential. We show, for example, that wormholes can be used to define the concept of causality. This in turn allows us to provide behaviors such as looping, a core component of most languages, without building it directly into the language. We also improve upon our previous design by making wormholes less verbose and easier to use. To formalize the notion of a wormhole, we define an extension to the simply typed lambda calculus, complete with typing rules and operational semantics. In addition, we present a new form of semantic transition that we call a temporal transition to specify how an FRP program behaves over time and to allow us to better reason about causality. As our model is designed for a Haskell implementation, the semantics are lazy. 
Finally, with the language defined, we prove that our wormholes indeed allow side effects to be performed safely in an FRP framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yorgey:2012:MTV, author = "Brent A. Yorgey", title = "{Monoids}: theme and variations (functional pearl)", journal = j-SIGPLAN, volume = "47", number = "12", pages = "105--116", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364520", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "The monoid is a humble algebraic structure, at first glance even downright boring. However, there's much more to monoids than meets the eye. Using examples taken from the diagrams vector graphics framework as a case study, I demonstrate the power and beauty of monoids for library design. The paper begins with an extremely simple model of diagrams and proceeds through a series of incremental variations, all related somehow to the central theme of monoids. Along the way, I illustrate the power of compositional semantics; why you should also pay attention to the monoid's even humbler cousin, the semigroup; monoid homomorphisms; and monoid actions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Eisenberg:2012:DTP, author = "Richard A. Eisenberg and Stephanie Weirich", title = "Dependently typed programming with singletons", journal = j-SIGPLAN, volume = "47", number = "12", pages = "117--130", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "Haskell programmers have been experimenting with dependent types for at least a decade, using clever encodings that push the limits of the Haskell type system. However, the cleverness of these encodings is also their main drawback. Although the ideas are inspired by dependently typed programs, the code looks significantly different. As a result, GHC implementors have responded with extensions to Haskell's type system, such as GADTs, type families, and datatype promotion. However, there remains a significant difference between programming in Haskell and in full-spectrum dependently typed languages. Haskell enforces a phase separation between runtime values and compile-time types. Therefore, singleton types are necessary to express the dependency between values and types. These singleton types introduce overhead and redundancy for the programmer. This paper presents the singletons library, which generates the boilerplate code necessary for dependently typed programming using GHC. To compare with full-spectrum languages, we present an extended example based on an Agda interface for safe database access. 
The paper concludes with a detailed discussion on the current capabilities of GHC for dependently typed programming and suggestions for future extensions to better support this style of programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Swierstra:2012:XCE, author = "Wouter Swierstra", title = "{{\tt xmonad}} in {Coq} (experience report): programming a window manager in a proof assistant", journal = j-SIGPLAN, volume = "47", number = "12", pages = "131--136", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364523", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "This report documents the insights gained from implementing the core functionality of xmonad, a popular window manager written in Haskell, in the Coq proof assistant. Rather than focus on verification, this report outlines the technical challenges involved with incorporating Coq code in a Haskell project.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Terei:2012:SH, author = "David Terei and Simon Marlow and Simon Peyton Jones and David Mazi{\`e}res", title = "{Safe Haskell}", journal = j-SIGPLAN, volume = "47", number = "12", pages = "137--148", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364524", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "Though Haskell is predominantly type-safe, implementations contain a few loopholes through which code can bypass typing and module encapsulation. This paper presents Safe Haskell, a language extension that closes these loopholes. Safe Haskell makes it possible to confine and safely execute untrusted, possibly malicious code. By strictly enforcing types, Safe Haskell allows a variety of different policies from API sandboxing to information-flow control to be implemented easily as monads. Safe Haskell is aimed to be as unobtrusive as possible. It enforces properties that programmers tend to meet already by convention. We describe the design of Safe Haskell and an implementation (currently shipping with GHC) that infers safety for code that lies in a safe subset of the language. We use Safe Haskell to implement an online Haskell interpreter that can securely execute arbitrary untrusted code with no overhead. 
The use of Safe Haskell greatly simplifies this task and allows the use of a large body of existing code and tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Erdweg:2012:LSL, author = "Sebastian Erdweg and Felix Rieger and Tillmann Rendel and Klaus Ostermann", title = "Layout-sensitive language extensibility with {SugarHaskell}", journal = j-SIGPLAN, volume = "47", number = "12", pages = "149--160", month = dec, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2430532.2364526", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jan 18 18:22:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '12 conference proceedings.", abstract = "Programmers need convenient syntax to write elegant and concise programs. Consequently, the Haskell standard provides syntactic sugar for some scenarios (e.g., do notation for monadic code), authors of Haskell compilers provide syntactic sugar for more scenarios (e.g., arrow notation in GHC), and some Haskell programmers implement preprocessors for their individual needs (e.g., idiom brackets in SHE). But manually written preprocessors cannot scale: They are expensive, error-prone, and not composable. Most researchers and programmers therefore refrain from using the syntactic notations they need in actual Haskell programs, but only use them in documentation or papers. We present a syntactically extensible version of Haskell, SugarHaskell, that empowers ordinary programmers to implement and use custom syntactic sugar. Building on our previous work on syntactic extensibility for Java, SugarHaskell integrates syntactic extensions as sugar libraries into Haskell's module system. Syntax extensions in SugarHaskell can declare arbitrary context-free and layout-sensitive syntax. SugarHaskell modules are compiled into Haskell modules and further processed by a Haskell compiler. We provide an Eclipse-based IDE for SugarHaskell that is extensible, too, and automatically provides syntax coloring for all syntax extensions imported into a module. We have validated SugarHaskell with several case studies, including arrow notation (as implemented in GHC) and EBNF as a concise syntax for the declaration of algebraic data types with associated concrete syntax. EBNF declarations also show how to extend the extension mechanism itself: They introduce syntactic sugar for using the declared concrete syntax in other SugarHaskell modules.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gonthier:2013:EMO, author = "Georges Gonthier", title = "Engineering mathematics: the odd order theorem proof", journal = j-SIGPLAN, volume = "48", number = "1", pages = "1--2", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429071", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Even with the assistance of computer tools, the formalized description and verification of research-level mathematics remains a daunting task, not least because of the talent with which mathematicians combine diverse theories to achieve their ends.
By combining tools and techniques from type theory, language design, and software engineering we have managed to capture enough of these practices to formalize the proof of the Odd Order theorem, a landmark result in Group Theory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Losch:2013:FAN, author = "Steffen L{\"o}sch and Andrew M. Pitts", title = "Full abstraction for nominal {Scott} domains", journal = j-SIGPLAN, volume = "48", number = "1", pages = "3--14", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429073", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We develop a domain theory within nominal sets and present programming language constructs and results that can be gained from this approach. The development is based on the concept of orbit-finite subset, that is, a subset of a nominal sets that is both finitely supported and contained in finitely many orbits. This concept appears prominently in the recent research programme of Bojanczyk et al. on automata over infinite languages, and our results establish a connection between their work and a characterisation of topological compactness discovered, in a quite different setting, by Winskel and Turner as part of a nominal domain theory for concurrency. We use this connection to derive a notion of Scott domain within nominal sets. The functionals for existential quantification over names and `definite description' over names turn out to be compact in the sense appropriate for nominal Scott domains. Adding them, together with parallel-or, to a programming language for recursively defined higher-order functions with name abstraction and locally scoped names, we prove a full abstraction result for nominal Scott domains analogous to Plotkin's classic result about PCF and conventional Scott domains: two program phrases have the same observable operational behaviour in all contexts if and only if they denote equal elements of the nominal Scott domain model. This is the first full abstraction result we know of for higher-order functions with local names that uses a domain theory based on ordinary extensional functions, rather than using the more intensional approach of game semantics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Tate:2013:SSP, author = "Ross Tate", title = "The sequential semantics of producer effect systems", journal = j-SIGPLAN, volume = "48", number = "1", pages = "15--26", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429074", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Effects are fundamental to programming languages. Even the lambda calculus has effects, and consequently the two famous evaluation strategies produce different semantics. As such, much research has been done to improve our understanding of effects. 
Since Moggi introduced monads for his computational lambda calculus, further generalizations have been designed to formalize increasingly complex computational effects, such as indexed monads followed by layered monads followed by parameterized monads. This succession prompted us to determine the most general formalization possible. In searching for this formalization we came across many surprises, such as the insufficiencies of arrows, as well as many unexpected insights, such as the importance of considering an effect as a small component of a whole system rather than just an isolated feature. In this paper we present our semantic formalization for producer effect systems, which we call a productor, and prove its maximal generality by focusing on only sequential composition of effectful computations, consequently guaranteeing that the existing monadic techniques are specializations of productors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Abel:2013:CPI, author = "Andreas Abel and Brigitte Pientka and David Thibodeau and Anton Setzer", title = "{Copatterns}: programming infinite structures by observations", journal = j-SIGPLAN, volume = "48", number = "1", pages = "27--38", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429075", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Inductive datatypes provide mechanisms to define finite data such as finite lists and trees via constructors and allow programmers to analyze and manipulate finite data via pattern matching. In this paper, we develop a dual approach for working with infinite data structures such as streams. Infinite data inhabits coinductive datatypes which denote greatest fixpoints. Unlike finite data which is defined by constructors we define infinite data by observations. Dual to pattern matching, a tool for analyzing finite data, we develop the concept of copattern matching, which allows us to synthesize infinite data. This leads to a symmetric language design where pattern matching on finite and infinite data can be mixed. We present a core language for programming with infinite structures by observations together with its operational semantics based on (co)pattern matching and describe coverage of copatterns. Our language naturally supports both call-by-name and call-by-value interpretations and can be seamlessly integrated into existing languages like Haskell and ML. We prove type soundness for our language and sketch how copatterns open new directions for solving problems in the interaction of coinductive and dependent types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Blelloch:2013:CEF, author = "Guy E. 
Blelloch and Robert Harper", title = "Cache and {I/O} efficient functional algorithms", journal = j-SIGPLAN, volume = "48", number = "1", pages = "39--50", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429077", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The widely studied I/O and ideal-cache models were developed to account for the large difference in costs to access memory at different levels of the memory hierarchy. Both models are based on a two-level memory hierarchy with a fixed-size primary memory (cache) of size {$M$} and an unbounded secondary memory organized in blocks of size {$B$}. The cost measure is based purely on the number of block transfers between the primary and secondary memory. All other operations are free. Many algorithms have been analyzed in these models and indeed these models predict the relative performance of algorithms much more accurately than the standard RAM model. The models, however, require specifying algorithms at a very low level, requiring the user to carefully lay out their data in arrays in memory and manage their own memory allocation. In this paper we present a cost model for analyzing the memory efficiency of algorithms expressed in a simple functional language. We show how some algorithms written in standard forms using just lists and trees (no arrays) and requiring no explicit memory layout or memory management are efficient in the model. We then describe an implementation of the language and show provable bounds for mapping the cost in our model to the cost in the ideal-cache model. These bounds imply that purely functional programs based on lists and trees with no special attention to any details of memory layout can be asymptotically as efficient as the carefully designed imperative I/O efficient algorithms. For example, we describe an {$ O((n/B) \log_{M/B} (n/B)) $} cost sorting algorithm, which is optimal in the ideal cache and I/O models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Ben-Amram:2013:LRP, author = "Amir M. Ben-Amram and Samir Genaim", title = "On the linear ranking problem for integer linear-constraint loops", journal = j-SIGPLAN, volume = "48", number = "1", pages = "51--62", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429078", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper we study the complexity of the Linear Ranking problem: given a loop, described by linear constraints over a finite set of integer variables, is there a linear ranking function for this loop? While existence of such a function implies termination, this problem is not equivalent to termination. When the variables range over the rationals or reals, the Linear Ranking problem is known to be PTIME decidable. However, when they range over the integers, whether for single-path or multipath loops, the complexity of the Linear Ranking problem has not yet been determined. We show that it is coNP-complete.
However, we point out some special cases of importance of PTIME complexity. We also present complete algorithms for synthesizing linear ranking functions, both for the general case and the special PTIME cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Mayr:2013:AAM, author = "Richard Mayr and Lorenzo Clemente", title = "Advanced automata minimization", journal = j-SIGPLAN, volume = "48", number = "1", pages = "63--74", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429079", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an efficient algorithm to reduce the size of nondeterministic Buchi word automata, while retaining their language. Additionally, we describe methods to solve PSPACE-complete automata problems like universality, equivalence and inclusion for much larger instances (1-3 orders of magnitude) than before. This can be used to scale up applications of automata in formal verification tools and decision procedures for logical theories. The algorithm is based on new transition pruning techniques. These use criteria based on combinations of backward and forward trace inclusions. Since these relations are themselves PSPACE-complete, we describe methods to compute good approximations of them in polynomial time. Extensive experiments show that the average-case complexity of our algorithm scales quadratically. The size reduction of the automata depends very much on the class of instances, but our algorithm consistently outperforms all previous techniques by a wide margin. We tested our algorithm on Buchi automata derived from LTL-formulae, many classes of random automata and automata derived from mutual exclusion protocols, and compared its performance to the well-known automata tool GOAL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Unno:2013:ARC, author = "Hiroshi Unno and Tachio Terauchi and Naoki Kobayashi", title = "Automating relatively complete verification of higher-order functional programs", journal = j-SIGPLAN, volume = "48", number = "1", pages = "75--86", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429081", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an automated approach to relatively completely verifying safety (i.e., reachability) property of higher-order functional programs. Our contribution is two-fold. First, we extend the refinement type system framework employed in the recent work on (incomplete) automated higher-order verification by drawing on the classical work on relatively complete ``Hoare logic like'' program logic for higher-order procedural languages. 
Then, by adopting the recently proposed techniques for solving constraints over quantified first-order logic formulas, we develop an automated type inference method for the type system, thereby realizing an automated relatively complete verification of higher-order programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Atkey:2013:AIA, author = "Robert Atkey and Patricia Johann and Andrew Kennedy", title = "Abstraction and invariance for algebraically indexed types", journal = j-SIGPLAN, volume = "48", number = "1", pages = "87--100", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429082", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reynolds' relational parametricity provides a powerful way to reason about programs in terms of invariance under changes of data representation. A dazzling array of applications of Reynolds' theory exists, exploiting invariance to yield ``free theorems'', non-inhabitation results, and encodings of algebraic datatypes. Outside computer science, invariance is a common theme running through many areas of mathematics and physics. For example, the area of a triangle is unaltered by rotation or flipping. If we scale a triangle, then we scale its area, maintaining an invariant relationship between the two. The transformations under which properties are invariant are often organised into groups, with the algebraic structure reflecting the composability and invertibility of transformations. In this paper, we investigate programming languages whose types are indexed by algebraic structures such as groups of geometric transformations. Other examples include types indexed by principals--for information flow security--and types indexed by distances--for analysis of analytic uniform continuity properties. Following Reynolds, we prove a general Abstraction Theorem that covers all these instances. Consequences of our Abstraction Theorem include free theorems expressing invariance properties of programs, type isomorphisms based on invariance properties, and non-definability results indicating when certain algebraically indexed types are uninhabited or only inhabited by trivial programs. We have fully formalised our framework and most examples in Coq.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Benzaken:2013:SDS, author = "V{\'e}ronique Benzaken and Giuseppe Castagna and Kim Nguyen and J{\'e}r{\^o}me Sim{\'e}on", title = "Static and dynamic semantics of {NoSQL} languages", journal = j-SIGPLAN, volume = "48", number = "1", pages = "101--114", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429083", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a calculus for processing semistructured data that spans differences of application area among several novel query languages, broadly categorized as ``NoSQL''. 
This calculus lets users define their own operators, capturing a wider range of data processing capabilities, whilst providing a typing precision so far typical only of primitive hard-coded operators. The type inference algorithm is based on semantic type checking, resulting in type information that is both precise, and flexible enough to handle structured and semistructured data. We illustrate the use of this calculus by encoding a large fragment of Jaql, including operations and iterators over JSON, embedded SQL expressions, and co-grouping, and show how the encoding directly yields a typing discipline for Jaql as it is, namely without the addition of any type definition or type annotation in the code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Cerny:2013:QAR, author = "Pavol Cerny and Thomas A. Henzinger and Arjun Radhakrishna", title = "Quantitative abstraction refinement", journal = j-SIGPLAN, volume = "48", number = "1", pages = "115--128", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429085", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a general framework for abstraction with respect to quantitative properties, such as worst-case execution time, or power consumption. Our framework provides a systematic way for counter-example guided abstraction refinement for quantitative properties. The salient aspect of the framework is that it allows anytime verification, that is, verification algorithms that can be stopped at any time (for example, due to exhaustion of memory), and report approximations that improve monotonically when the algorithms are given more time. We instantiate the framework with a number of quantitative abstractions and refinement schemes, which differ in terms of how much quantitative information they keep from the original system. We introduce both state-based and trace-based quantitative abstractions, and we describe conditions that define classes of quantitative properties for which the abstractions provide over-approximations. We give algorithms for evaluating the quantitative properties on the abstract systems. We present algorithms for counter-example based refinements for quantitative properties for both state-based and segment-based abstractions. 
We perform a case study on worst-case execution time of executables to evaluate the anytime verification aspect and the quantitative abstractions we proposed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Farzan:2013:IDF, author = "Azadeh Farzan and Zachary Kincaid and Andreas Podelski", title = "Inductive data flow graphs", journal = j-SIGPLAN, volume = "48", number = "1", pages = "129--142", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429086", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The correctness of a sequential program can be shown by the annotation of its control flow graph with inductive assertions. We propose inductive data flow graphs, data flow graphs with incorporated inductive assertions, as the basis of an approach to verifying concurrent programs. An inductive data flow graph accounts for a set of dependencies between program actions in interleaved thread executions, and therefore stands as a representation for the set of concurrent program traces which give rise to these dependencies. The approach first constructs an inductive data flow graph and then checks whether all program traces are represented. The size of the inductive data flow graph is polynomial in the number of data dependencies (in a sense that can be made formal); it does not grow exponentially in the number of threads unless the data dependencies do. The approach shifts the burden of the exponential explosion towards the check whether all program traces are represented, i.e., to a combinatorial problem (over finite graphs).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{DSilva:2013:ACD, author = "Vijay D'Silva and Leopold Haller and Daniel Kroening", title = "Abstract conflict driven learning", journal = j-SIGPLAN, volume = "48", number = "1", pages = "143--154", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429087", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern satisfiability solvers implement an algorithm, called Conflict Driven Clause Learning, which combines search for a model with analysis of conflicts. We show that this algorithm can be generalised to solve the lattice-theoretic problem of determining if an additive transformer on a Boolean lattice is always bottom. Our generalised procedure combines overapproximations of greatest fixed points with underapproximation of least fixed points to obtain more precise results than computing fixed points in isolation. We generalise implication graphs used in satisfiability solvers to derive underapproximate transformers from overapproximate ones. 
Our generalisation provides a new method for static analysers that operate over non-distributive lattices to reason about properties that require disjunction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Goyet:2013:LLB, author = "Alexis Goyet", title = "The {Lambda Lambda-Bar} calculus: a dual calculus for unconstrained strategies", journal = j-SIGPLAN, volume = "48", number = "1", pages = "155--166", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429089", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a calculus which combines a simple, CCS-like representation of finite behaviors, with two dual binders $ \lambda $ and $ {\= \lambda } $. Infinite behaviors are obtained through a syntactical fixed-point operator, which is used to give a translation of $ \lambda $-terms. The duality of the calculus makes the roles of a function and its environment symmetrical. As usual, the environment is allowed to call a function at any given point, each time with a different argument. Dually, the function is allowed to answer any given call, each time with a different behavior. This grants terms in our language the power of functional references. The inspiration for this language comes from game semantics. Indeed, its normal forms give a simple concrete syntax for finite strategies, which are inherently non-innocent. This very direct correspondence allows us to describe, in syntactical terms, a number of features from game semantics. The fixed-point expansion of translated $ \lambda $-terms corresponds to the generation of infinite plays from the finite views of an innocent strategy. The syntactical duality between terms and co-terms corresponds to the duality between Player and Opponent. This duality also gives rise to a B{\"o}hm-out lemma. The paper is divided into two parts. The first one is purely syntactical, and requires no background in game semantics. The second describes the fully abstract game model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{lago:2013:GT, author = "Ugo {Dal lago} and Barbara Petit", title = "The geometry of types", journal = j-SIGPLAN, volume = "48", number = "1", pages = "167--178", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429090", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We show that time complexity analysis of higher-order functional programs can be effectively reduced to an arguably simpler (although computationally equivalent) verification problem, namely checking first-order inequalities for validity. This is done by giving an efficient inference algorithm for linear dependent types which, given a PCF term, produces in output both a linear dependent type and a cost expression for the term, together with a set of proof obligations. Actually, the output type judgement is derivable iff all proof obligations are valid. 
This, coupled with the already known relative completeness of linear dependent types, ensures that no information is lost, i.e., that there are no false positives or negatives. Moreover, the procedure reflects the difficulty of the original problem: simple PCF terms give rise to sets of proof obligations which are easy to solve. The latter can then be put in a format suitable for automatic or semi-automatic verification by external solvers. Ongoing experimental evaluation has produced encouraging results, which are briefly presented in the paper.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Staton:2013:UPI, author = "Sam Staton and Paul Blain Levy", title = "Universal properties of impure programming languages", journal = j-SIGPLAN, volume = "48", number = "1", pages = "179--192", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429091", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We investigate impure, call-by-value programming languages. Our first language only has variables and let-binding. Its equational theory is a variant of Lambek's theory of multicategories that omits the commutativity axiom. We demonstrate that type constructions for impure languages --- products, sums and functions --- can be characterized by universal properties in the setting of `premulticategories', multicategories where the commutativity law may fail. This leads us to new, universal characterizations of two earlier equational theories of impure programming languages: the premonoidal categories of Power and Robinson, and the monad-based models of Moggi. Our analysis thus puts these earlier abstract ideas on a canonical foundation, bringing them to a new, syntactic level.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Hur:2013:PPC, author = "Chung-Kil Hur and Georg Neis and Derek Dreyer and Viktor Vafeiadis", title = "The power of parameterization in coinductive proof", journal = j-SIGPLAN, volume = "48", number = "1", pages = "193--206", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429093", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Coinduction is one of the most basic concepts in computer science. It is therefore surprising that the commonly-known lattice-theoretic accounts of the principles underlying coinductive proofs are lacking in two key respects: they do not support compositional reasoning (i.e. breaking proofs into separate pieces that can be developed in isolation), and they do not support incremental reasoning (i.e. developing proofs interactively by starting from the goal and generalizing the coinduction hypothesis repeatedly as necessary). In this paper, we show how to support coinductive proofs that are both compositional and incremental, using a dead simple construction we call the parameterized greatest fixed point. 
The basic idea is to parameterize the greatest fixed point of interest over the accumulated knowledge of ``the proof so far''. While this idea has been proposed before, by Winskel in 1989 and by Moss in 2001, neither of the previous accounts suggests its general applicability to improving the state of the art in interactive coinductive proof. In addition to presenting the lattice-theoretic foundations of parameterized coinduction, demonstrating its utility on representative examples, and studying its composition with ``up-to'' techniques, we also explore its mechanization in proof assistants like Coq and Isabelle. Unlike traditional approaches to mechanizing coinduction (e.g. Coq's cofix), which employ syntactic ``guardedness checking'', parameterized coinduction offers a semantic account of guardedness. This leads to faster and more robust proof development, as we demonstrate using our new Coq library, Paco.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Delaware:2013:MTC, author = "Benjamin Delaware and Bruno C. d. S. Oliveira and Tom Schrijvers", title = "Meta-theory {\`a} la carte", journal = j-SIGPLAN, volume = "48", number = "1", pages = "207--218", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429094", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Formalizing meta-theory, or proofs about programming languages, in a proof assistant has many well-known benefits. Unfortunately, the considerable effort involved in mechanizing proofs has prevented it from becoming standard practice. This cost can be amortized by reusing as much of existing mechanized formalizations as possible when building a new language or extending an existing one. One important challenge in achieving reuse is that the inductive definitions and proofs used in these formalizations are closed to extension. This forces language designers to cut and paste existing definitions and proofs in an ad-hoc manner and to expend considerable effort to patch up the results. The key contribution of this paper is the development of an induction technique for extensible Church encodings using a novel reinterpretation of the universal property of folds. These encodings provide the foundation for a framework, formalized in Coq, which uses type classes to automate the composition of proofs from modular components. This framework enables a more structured approach to the reuse of meta-theory formalizations through the composition of modular inductive definitions and proofs. Several interesting language features, including binders and general recursion, illustrate the capabilities of our framework. We reuse these features to build fully mechanized definitions and proofs for a number of languages, including a version of mini-ML. 
Bounded induction enables proofs of properties for non-inductive semantic functions, and mediating type classes enable proof adaptation for more feature-rich languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Park:2013:TPB, author = "Jonghyun Park and Jeongbong Seo and Sungwoo Park", title = "A theorem prover for {Boolean} {BI}", journal = j-SIGPLAN, volume = "48", number = "1", pages = "219--232", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429095", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "While separation logic is acknowledged as an enabling technology for large-scale program verification, most of the existing verification tools use only a fragment of separation logic that excludes separating implication. As the first step towards a verification tool using full separation logic, we develop a nested sequent calculus for Boolean BI (Bunched Implications), the underlying theory of separation logic, as well as a theorem prover based on it. A salient feature of our nested sequent calculus is that its sequent may have not only smaller child sequents but also multiple parent sequents, thus producing a graph structure of sequents instead of a tree structure. Our theorem prover is based on backward search in a refinement of the nested sequent calculus in which weakening and contraction are built into all the inference rules. We explain the details of designing our theorem prover and provide empirical evidence of its practicality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Krishnamurthi:2013:PPL, author = "Shriram Krishnamurthi", title = "From principles to programming languages (and back)", journal = j-SIGPLAN, volume = "48", number = "1", pages = "233--234", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429097", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Batty:2013:LAC, author = "Mark Batty and Mike Dodds and Alexey Gotsman", title = "Library abstraction for {C\slash C++} concurrency", journal = j-SIGPLAN, volume = "48", number = "1", pages = "235--248", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429099", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When constructing complex concurrent systems, abstraction is vital: programmers should be able to reason about concurrent libraries in terms of abstract specifications that hide the implementation details. 
Relaxed memory models present substantial challenges in this respect, as libraries need not provide sequentially consistent abstractions: to avoid unnecessary synchronisation, they may allow clients to observe relaxed memory effects, and library specifications must capture these. In this paper, we propose a criterion for sound library abstraction in the new C11 and C++11 memory model, generalising the standard sequentially consistent notion of linearizability. We prove that our criterion soundly captures all client-library interactions, both through call and return values, and through the subtle synchronisation effects arising from the memory model. To illustrate our approach, we verify implementations against specifications for the lock-free Treiber stack and a producer-consumer queue. Ours is the first approach to compositional reasoning for concurrent C11/C++11 programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Ramalingam:2013:FTI, author = "Ganesan Ramalingam and Kapil Vaswani", title = "Fault tolerance via idempotence", journal = j-SIGPLAN, volume = "48", number = "1", pages = "249--262", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429100", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Building distributed services and applications is challenging due to the pitfalls of distribution such as process and communication failures. A natural solution to these problems is to detect potential failures, and retry the failed computation and/or resend messages. Ensuring correctness in such an environment requires distributed services and applications to be idempotent. In this paper, we study the inter-related aspects of process failures, duplicate messages, and idempotence. We first introduce a simple core language (based on lambda calculus) inspired by modern distributed computing platforms. This language formalizes the notions of a service, duplicate requests, process failures, data partitioning, and local atomic transactions that are restricted to a single store. We then formalize a desired (generic) correctness criterion for applications written in this language, consisting of idempotence (which captures the desired safety properties) and failure-freedom (which captures the desired progress properties). We then propose language support in the form of a monad that automatically ensures failfree idempotence. A key characteristic of our implementation is that it is decentralized and does not require distributed coordination. We show that the language support can be enriched with other useful constructs, such as compensations, while retaining the coordination-free decentralized nature of the implementation. We have implemented the idempotence monad (and its variants) in F\# and C\# and used our implementation to build realistic applications on Windows Azure. 
We find that the monad has low runtime overheads and leads to more declarative applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Carbone:2013:DFD, author = "Marco Carbone and Fabrizio Montesi", title = "Deadlock-freedom-by-design: multiparty asynchronous global programming", journal = j-SIGPLAN, volume = "48", number = "1", pages = "263--274", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429101", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the last decade, global descriptions have been successfully employed for the verification and implementation of communicating systems, respectively as protocol specifications and choreographies. In this work, we bring these two practices together by proposing a purely-global programming model. We show a novel interpretation of asynchrony and parallelism in a global setting and develop a typing discipline that verifies choreographies against protocol specifications, based on multiparty sessions. Exploiting the nature of global descriptions, our type system defines a new class of deadlock-free concurrent systems (deadlock-freedom-by-design), provides type inference, and supports session mobility. We give a notion of Endpoint Projection (EPP) which generates correct entity code (as pi-calculus terms) from a choreography. Finally, we evaluate our approach by providing a prototype implementation for a concrete programming language and by applying it to some examples from multicore and service-oriented programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Caires:2013:TDB, author = "Lu{\'\i}s Caires and Jo{\~a}o C. Seco", title = "The type discipline of behavioral separation", journal = j-SIGPLAN, volume = "48", number = "1", pages = "275--286", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429103", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce the concept of behavioral separation as a general principle for disciplining interference in higher-order imperative concurrent programs, and present a type-based approach that systematically develops the concept in the context of an ML-like language extended with concurrency and synchronization primitives. Behavioral separation builds on notions originally introduced for behavioral type systems and separation logics, but shifts the focus from the separation of static program state properties towards the separation of dynamic usage behaviors of runtime values. Behavioral separation types specify how values may be safely used by client code, and can enforce fine-grained interference control disciplines while preserving compositionality, information hiding, and flexibility. 
We illustrate how our type system, even if based on a small set of general primitives, is already able to tackle fairly challenging program idioms, involving aliasing at various types, concurrency with first-class threads, manipulation of linked data structures, behavioral borrowing, and invariant-based separation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Dinsdale-Young:2013:VCR, author = "Thomas Dinsdale-Young and Lars Birkedal and Philippa Gardner and Matthew Parkinson and Hongseok Yang", title = "{Views}: compositional reasoning for concurrent programs", journal = j-SIGPLAN, volume = "48", number = "1", pages = "287--300", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429104", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compositional abstractions underly many reasoning principles for concurrent programs: the concurrent environment is abstracted in order to reason about a thread in isolation; and these abstractions are composed to reason about a program consisting of many threads. For instance, separation logic uses formulae that describe part of the state, abstracting the rest; when two threads use disjoint state, their specifications can be composed with the separating conjunction. Type systems abstract the state to the types of variables; threads may be composed when they agree on the types of shared variables. In this paper, we present the ``Concurrent Views Framework'', a metatheory of concurrent reasoning principles. The theory is parameterised by an abstraction of state with a notion of composition, which we call views. The metatheory is remarkably simple, but highly applicable: the rely-guarantee method, concurrent separation logic, concurrent abstract predicates, type systems for recursive references and for unique pointers, and even an adaptation of the Owicki-Gries method can all be seen as instances of the Concurrent Views Framework. Moreover, our metatheory proves each of these systems is sound without requiring induction on the operational semantics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Jensen:2013:HLS, author = "Jonas B. Jensen and Nick Benton and Andrew Kennedy", title = "High-level separation logic for low-level code", journal = j-SIGPLAN, volume = "48", number = "1", pages = "301--314", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429105", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Separation logic is a powerful tool for reasoning about structured, imperative programs that manipulate pointers. However, its application to unstructured, lower-level languages such as assembly language or machine code remains challenging. In this paper we describe a separation logic tailored for this purpose that we have applied to x86 machine-code programs. 
The logic is built from an assertion logic on machine states over which we construct a specification logic that encapsulates uses of frames and step indexing. The traditional notion of Hoare triple is not applicable directly to unstructured machine code, where code and data are mixed together and programs do not in general run to completion, so instead we adopt a continuation-passing style of specification with preconditions alone. Nevertheless, the range of primitives provided by the specification logic, which include a higher-order frame connective, a novel read-only frame connective, and a 'later' modality, support the definition of derived forms to support structured-programming-style reasoning for common cases, in which standard rules for Hoare triples are derived as lemmas. Furthermore, our encoding of scoped assembly-language labels lets us give definitions and proof rules for powerful assembly-language 'macros' such as while loops, conditionals and procedures. We have applied the framework to a model of sequential x86 machine code built entirely within the Coq proof assistant, including tactic support based on computational reflection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Myers:2013:HLC, author = "Andrew C. Myers", title = "How languages can save distributed computing", journal = j-SIGPLAN, volume = "48", number = "1", pages = "315--316", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429107", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Henzinger:2013:QRC, author = "Thomas A. Henzinger and Christoph M. Kirsch and Hannes Payer and Ali Sezgin and Ana Sokolova", title = "Quantitative relaxation of concurrent data structures", journal = j-SIGPLAN, volume = "48", number = "1", pages = "317--328", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429109", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There is a trade-off between performance and correctness in implementing concurrent data structures. Better performance may be achieved at the expense of relaxing correctness, by redefining the semantics of data structures. We address such a redefinition of data structure semantics and present a systematic and formal framework for obtaining new data structures by quantitatively relaxing existing ones. We view a data structure as a sequential specification S containing all ``legal'' sequences over an alphabet of method calls. Relaxing the data structure corresponds to defining a distance from any sequence over the alphabet to the sequential specification: the k-relaxed sequential specification contains all sequences over the alphabet within distance k from the original specification. In contrast to other existing work, our relaxations are semantic (distance in terms of data structure states). 
As an instantiation of our framework, we present two simple yet generic relaxation schemes, called out-of-order and stuttering relaxation, along with several ways of computing distances. We show that the out-of-order relaxation, when further instantiated to stacks, queues, and priority queues, amounts to tolerating bounded out-of-order behavior, which cannot be captured by a purely syntactic relaxation (distance in terms of sequence manipulation, e.g. edit distance). We give concurrent implementations of relaxed data structures and demonstrate that bounded relaxations provide the means for trading correctness for performance in a controlled way. The relaxations are monotonic, which further highlights the trade-off: increasing k increases the number of permitted sequences, which as we demonstrate can lead to better performance. Finally, since a relaxed stack or queue also implements a pool, we actually have new concurrent pool implementations that outperform the state-of-the-art ones.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Demange:2013:PBB, author = "Delphine Demange and Vincent Laporte and Lei Zhao and Suresh Jagannathan and David Pichardie and Jan Vitek", title = "{Plan B}: a buffered memory model for {Java}", journal = j-SIGPLAN, volume = "48", number = "1", pages = "329--342", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429110", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent advances in verification have made it possible to envision trusted implementations of real-world languages. Java with its type-safety and fully specified semantics would appear to be an ideal candidate; yet, the complexity of the translation steps used in production virtual machines has made it a challenging target for verifying compiler technology. One of Java's key innovations, its memory model, poses significant obstacles to such an endeavor. The Java Memory Model is an ambitious attempt at specifying the behavior of multithreaded programs in a portable, hardware agnostic, way. While experts have an intuitive grasp of the properties that the model should enjoy, the specification is complex and not well-suited for integration within a verifying compiler infrastructure. Moreover, the specification is given in an axiomatic style that is distant from the intuitive reordering-based reasonings traditionally used to justify or rule out behaviors, and ill suited to the kind of operational reasoning one would expect to employ in a compiler. This paper takes a step back, and introduces a Buffered Memory Model (BMM) for Java. We choose a pragmatic point in the design space sacrificing generality in favor of a model that is fully characterized in terms of the reorderings it allows, amenable to formal reasoning, and which can be efficiently applied to a specific hardware family, namely x86 multiprocessors. Although the BMM restricts the reorderings compilers are allowed to perform, it serves as the key enabling device to achieving a verification pathway from bytecode to machine instructions.
Despite its restrictions, we show that it is backwards compatible with the Java Memory Model and that it does not cripple performance on TSO architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Turon:2013:LRF, author = "Aaron J. Turon and Jacob Thamsborg and Amal Ahmed and Lars Birkedal and Derek Dreyer", title = "Logical relations for fine-grained concurrency", journal = j-SIGPLAN, volume = "48", number = "1", pages = "343--356", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429111", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Fine-grained concurrent data structures (or FCDs) reduce the granularity of critical sections in both time and space, thus making it possible for clients to access different parts of a mutable data structure in parallel. However, the tradeoff is that the implementations of FCDs are very subtle and tricky to reason about directly. Consequently, they are carefully designed to be contextual refinements of their coarse-grained counterparts, meaning that their clients can reason about them as if all access to them were sequentialized. In this paper, we propose a new semantic model, based on Kripke logical relations, that supports direct proofs of contextual refinement in the setting of a type-safe high-level language. The key idea behind our model is to provide a simple way of expressing the ``local life stories'' of individual pieces of an FCD's hidden state by means of protocols that the threads concurrently accessing that state must follow. By endowing these protocols with a simple yet powerful transition structure, as well as the ability to assert invariants on both heap states and specification code, we are able to support clean and intuitive refinement proofs for the most sophisticated types of FCDs, such as conditional compare-and-set (CCAS).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Gaboardi:2013:LDT, author = "Marco Gaboardi and Andreas Haeberlen and Justin Hsu and Arjun Narayan and Benjamin C. Pierce", title = "Linear dependent types for differential privacy", journal = j-SIGPLAN, volume = "48", number = "1", pages = "357--370", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429113", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Differential privacy offers a way to answer queries about sensitive information while providing strong, provable privacy guarantees, ensuring that the presence or absence of a single individual in the database has a negligible statistical effect on the query's result. Proving that a given query has this property involves establishing a bound on the query's sensitivity---how much its result can change when a single record is added or removed. A variety of tools have been developed for certifying that a given query is differentially private.
In one approach, Reed and Pierce [34] proposed a functional programming language, Fuzz, for writing differentially private queries. Fuzz uses linear types to track sensitivity and a probability monad to express randomized computation; it guarantees that any program with a certain type is differentially private. Fuzz can successfully verify many useful queries. However, it fails when the sensitivity analysis depends on values that are not known statically. We present DFuzz, an extension of Fuzz with a combination of linear indexed types and lightweight dependent types. This combination allows a richer sensitivity analysis that is able to certify a larger class of queries as differentially private, including ones whose sensitivity depends on runtime information. As in Fuzz, the differential privacy guarantee follows directly from the soundness theorem of the type system. We demonstrate the enhanced expressivity of DFuzz by certifying differential privacy for a broad class of iterative algorithms that could not be typed previously.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Fournet:2013:FAC, author = "Cedric Fournet and Nikhil Swamy and Juan Chen and Pierre-Evariste Dagand and Pierre-Yves Strub and Benjamin Livshits", title = "Fully abstract compilation to {JavaScript}", journal = j-SIGPLAN, volume = "48", number = "1", pages = "371--384", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429114", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many tools allow programmers to develop applications in high-level languages and deploy them in web browsers via compilation to JavaScript. While practical and widely used, these compilers are ad hoc: no guarantee is provided on their correctness for whole programs, nor their security for programs executed within arbitrary JavaScript contexts. This paper presents a compiler with such guarantees. We compile an ML-like language with higher-order functions and references to JavaScript, while preserving all source program properties. Relying on type-based invariants and applicative bisimilarity, we show full abstraction: two programs are equivalent in all source contexts if and only if their wrapped translations are equivalent in all JavaScript contexts. 
We evaluate our compiler on sample programs, including a series of secure libraries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Livshits:2013:TFA, author = "Benjamin Livshits and Stephen Chong", title = "Towards fully automatic placement of security sanitizers and declassifiers", journal = j-SIGPLAN, volume = "48", number = "1", pages = "385--398", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429115", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A great deal of research on sanitizer placement, sanitizer correctness, checking path validity, and policy inference, has been done in the last five to ten years, involving type systems, static analysis and runtime monitoring and enforcement. However, in pretty much all work thus far, the burden of sanitizer placement has fallen on the developer. However, sanitizer placement in large-scale applications is difficult, and developers are likely to make errors, and thus create security vulnerabilities. This paper advocates a radically different approach: we aim to fully automate the placement of sanitizers by analyzing the flow of tainted data in the program. We argue that developers are better off leaving out sanitizers entirely instead of trying to place them. This paper proposes a fully automatic technique for sanitizer placement. Placement is static whenever possible, switching to run time when necessary. Run-time taint tracking techniques can be used to track the source of a value, and thus apply appropriate sanitization. However, due to the runtime overhead of run-time taint tracking, our technique avoids it wherever possible.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Goodman:2013:PPP, author = "Noah D. Goodman", title = "The principles and practice of probabilistic programming", journal = j-SIGPLAN, volume = "48", number = "1", pages = "399--402", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429117", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Gordon:2013:MLP, author = "Andrew D. Gordon and Mihhail Aizatulin and Johannes Borgstrom and Guillaume Claret and Thore Graepel and Aditya V. Nori and Sriram K.
Rajamani and Claudio Russo", title = "A model-learner pattern for {Bayesian} reasoning", journal = j-SIGPLAN, volume = "48", number = "1", pages = "403--416", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429119", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A Bayesian model is based on a pair of probability distributions, known as the prior and sampling distributions. A wide range of fundamental machine learning tasks, including regression, classification, clustering, and many others, can all be seen as Bayesian models. We propose a new probabilistic programming abstraction, a typed Bayesian model, which is based on a pair of probabilistic expressions for the prior and sampling distributions. A sampler for a model is an algorithm to compute synthetic data from its sampling distribution, while a learner for a model is an algorithm for probabilistic inference on the model. Models, samplers, and learners form a generic programming pattern for model-based inference. They support the uniform expression of common tasks including model testing, and generic compositions such as mixture models, evidence-based model averaging, and mixtures of experts. A formal semantics supports reasoning about model equivalence and implementation correctness. By developing a series of examples and three learner implementations based on exact inference, factor graphs, and Markov chain Monte Carlo, we demonstrate the broad applicability of this new programming pattern.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Suenaga:2013:HPS, author = "Kohei Suenaga and Hiroyoshi Sekine and Ichiro Hasuo", title = "Hyperstream processing systems: nonstandard modeling of continuous-time signals", journal = j-SIGPLAN, volume = "48", number = "1", pages = "417--430", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429120", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We exploit the apparent similarity between (discrete-time) stream processing and (continuous-time) signal processing and transfer a deductive verification framework from the former to the latter. Our development is based on rigorous semantics that relies on nonstandard analysis (NSA). Specifically, we start with a discrete framework consisting of a Lustre-like stream processing language, its Kahn-style fixed point semantics, and a program logic (in the form of a type system) for partial correctness guarantees. This stream framework is transferred as it is to one for hyperstreams---streams of streams, that typically arise from sampling (continuous-time) signals with progressively smaller intervals---via the logical infrastructure of NSA. Under a certain continuity assumption we identify hyperstreams with signals; our final outcome thus obtained is a deductive verification framework of signals. 
In it one verifies properties of signals using the (conventionally discrete) proof principles, like fixed point induction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Vytiniotis:2013:HHL, author = "Dimitrios Vytiniotis and Simon Peyton Jones and Koen Claessen and Dan Ros{\'e}n", title = "{HALO}: {Haskell} to logic through denotational semantics", journal = j-SIGPLAN, volume = "48", number = "1", pages = "431--442", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429121", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Even well-typed programs can go wrong in modern functional languages, by encountering a pattern-match failure, or simply returning the wrong answer. An increasingly-popular response is to allow programmers to write contracts that express semantic properties, such as crash-freedom or some useful post-condition. We study the static verification of such contracts. Our main contribution is a novel translation to first-order logic of both Haskell programs, and contracts written in Haskell, all justified by denotational semantics. This translation enables us to prove that functions satisfy their contracts using an off-the-shelf first-order logic theorem prover.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Botincan:2013:SSL, author = "Matko Botincan and Domagoj Babi{\'c}", title = "{Sigma*}: symbolic learning of input-output specifications", journal = j-SIGPLAN, volume = "48", number = "1", pages = "443--456", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429123", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Sigma*, a novel technique for learning symbolic models of software behavior. Sigma* addresses the challenge of synthesizing models of software by using symbolic conjectures and abstraction. By combining dynamic symbolic execution to discover symbolic input-output steps of the programs and counterexample guided abstraction refinement to over-approximate program behavior, Sigma* transforms arbitrary source representation of programs into faithful input-output models. We define a class of stream filters---programs that process streams of data items---for which Sigma* converges to a complete model if abstraction refinement eventually builds up a sufficiently strong abstraction. In other words, Sigma* is complete relative to abstraction. To represent inferred symbolic models, we use a variant of symbolic transducers that can be effectively composed and equivalence checked. Thus, Sigma* enables fully automatic analysis of behavioral properties such as commutativity, reversibility and idempotence, which is useful for web sanitizer verification and stream programs compiler optimizations, as we show experimentally. 
We also show how models inferred by Sigma* can boost performance of stream programs by parallelized code generation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Bonchi:2013:CNE, author = "Filippo Bonchi and Damien Pous", title = "Checking {NFA} equivalence with bisimulations up to congruence", journal = j-SIGPLAN, volume = "48", number = "1", pages = "457--468", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429124", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce bisimulation up to congruence as a technique for proving language equivalence of non-deterministic finite automata. Exploiting this technique, we devise an optimisation of the classical algorithm by Hopcroft and Karp. We compare our approach to the recently introduced antichain algorithms, by analysing and relating the two underlying coinductive proof methods. We give concrete examples where we exponentially improve over antichains; experimental results moreover show non negligible improvements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Koksal:2013:SBM, author = "Ali Sinan Koksal and Yewen Pu and Saurabh Srivastava and Rastislav Bodik and Jasmin Fisher and Nir Piterman", title = "Synthesis of biological models from mutation experiments", journal = j-SIGPLAN, volume = "48", number = "1", pages = "469--482", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429125", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Executable biology presents new challenges to formal methods. This paper addresses two problems that cell biologists face when developing formally analyzable models. First, we show how to automatically synthesize a concurrent in-silico model for cell development given in-vivo experiments of how particular mutations influence the experiment outcome. The problem of synthesis under mutations is unique because mutations may produce non-deterministic outcomes (presumably by introducing races between competing signaling pathways in the cells) and the synthesized model must be able to replay all these outcomes in order to faithfully describe the modeled cellular processes. In contrast, a ``regular'' concurrent program is correct if it picks any outcome allowed by the non-deterministic specification. We developed synthesis algorithms and synthesized a model of cell fate determination of the earthworm {\em C. elegans}. A version of this model previously took systems biologists months to develop. Second, we address the problem of under-constrained specifications that arise due to incomplete sets of mutation experiments. Under-constrained specifications give rise to distinct models, each explaining the same phenomenon differently. Addressing the ambiguity of specifications corresponds to analyzing the space of plausible models. 
We develop algorithms for detecting ambiguity in specifications, i.e., whether there exist alternative models that would produce different fates on some unperformed experiment, and for removing redundancy from specifications, i.e., computing minimal non-ambiguous specifications. Additionally, we develop a modeling language and embed it into Scala. We describe how this language design and embedding allows us to build an efficient synthesizer. For our {\em C. elegans\/} case study, we infer two observationally equivalent models expressing different biological hypotheses through different protein interactions. One of these hypotheses was previously unknown to biologists.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Upadrasta:2013:SPS, author = "Ramakrishna Upadrasta and Albert Cohen", title = "Sub-polyhedral scheduling using (unit-)two-variable-per-inequality polyhedra", journal = j-SIGPLAN, volume = "48", number = "1", pages = "483--496", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429127", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Polyhedral compilation has been successful in the design and implementation of complex loop nest optimizers and parallelizing compilers. The algorithmic complexity and scalability limitations remain one important weakness. We address it using sub-polyhedral under-approximations of the systems of constraints resulting from affine scheduling problems. We propose a sub-polyhedral scheduling technique using (Unit-)Two-Variable-Per-Inequality or (U)TVPI Polyhedra. This technique relies on simple polynomial time algorithms to under-approximate a general polyhedron into (U)TVPI polyhedra. We modify the state-of-the-art PLuTo compiler using our scheduling technique, and show that for a majority of the Polybench (2.0) kernels, the above under-approximations yield polyhedra that are non-empty. Solving the under-approximated system leads to asymptotic gains in complexity, and shows practically significant improvements when compared to a traditional LP solver. We also verify that code generated by our sub-polyhedral parallelization prototype matches the performance of PLuTo-optimized code when the under-approximation preserves feasibility.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Rompf:2013:ODS, author = "Tiark Rompf and Arvind K. Sujeeth and Nada Amin and Kevin J. 
Brown and Vojin Jovanovic and HyoukJoong Lee and Manohar Jonnalagedda and Kunle Olukotun and Martin Odersky", title = "Optimizing data structures in high-level programs: new directions for extensible compilers based on staging", journal = j-SIGPLAN, volume = "48", number = "1", pages = "497--510", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429128", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "High level data structures are a cornerstone of modern programming and at the same time stand in the way of compiler optimizations. In order to reason about user- or library-defined data structures compilers need to be extensible. Common mechanisms to extend compilers fall into two categories. Frontend macros, staging or partial evaluation systems can be used to programmatically remove abstraction and specialize programs before they enter the compiler. Alternatively, some compilers allow extending the internal workings by adding new transformation passes at different points in the compile chain or adding new intermediate representation (IR) types. None of these mechanisms alone is sufficient to handle the challenges posed by high level data structures. This paper shows a novel way to combine them to yield benefits that are greater than the sum of the parts. Instead of using staging merely as a front end, we implement internal compiler passes using staging as well. These internal passes delegate back to program execution to construct the transformed IR. Staging is known to simplify program generation, and in the same way it can simplify program transformation. Defining a transformation as a staged IR interpreter is simpler than implementing a low-level IR to IR transformer. With custom IR nodes, many optimizations that are expressed as rewritings from IR nodes to staged program fragments can be combined into a single pass, mitigating phase ordering problems. Speculative rewriting can preserve optimistic assumptions around loops. We demonstrate several powerful program optimizations using this architecture that are particularly geared towards data structures: a novel loop fusion and deforestation algorithm, array of struct to struct of array conversion, object flattening and code generation for heterogeneous parallel devices. We validate our approach using several non trivial case studies that exhibit order of magnitude speedups in experiments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Adams:2013:PPI, author = "Michael D. Adams", title = "Principled parsing for indentation-sensitive languages: revisiting {Landin}'s offside rule", journal = j-SIGPLAN, volume = "48", number = "1", pages = "511--522", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429129", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Several popular languages, such as Haskell, Python, and F\#, use the indentation and layout of code as part of their syntax. 
Because context-free grammars cannot express the rules of indentation, parsers for these languages currently use ad hoc techniques to handle layout. These techniques tend to be low-level and operational in nature and forgo the advantages of more declarative specifications like context-free grammars. For example, they are often coded by hand instead of being generated by a parser generator. This paper presents a simple extension to context-free grammars that can express these layout rules, and derives GLR and LR(k) algorithms for parsing these grammars. These grammars are easy to write and can be parsed efficiently. Examples for several languages are presented, as are benchmarks showing the practical efficiency of these algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Hobor:2013:RSD, author = "Aquinas Hobor and Jules Villard", title = "The ramifications of sharing in data structures", journal = j-SIGPLAN, volume = "48", number = "1", pages = "523--536", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429131", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programs manipulating mutable data structures with intrinsic sharing present a challenge for modular verification. Deep aliasing inside data structures dramatically complicates reasoning in isolation over parts of these objects because changes to one part of the structure (say, the left child of a dag node) can affect other parts (the right child or some of its descendants) that may point into it. The result is that finding intuitive and compositional proofs of correctness is usually a struggle. We propose a compositional proof system that enables local reasoning in the presence of sharing. While the AI ``frame problem'' elegantly captures the reasoning required to verify programs without sharing, we contend that natural reasoning about programs with sharing instead requires an answer to a different and more challenging AI problem, the ``ramification problem'': reasoning about the indirect consequences of actions. Accordingly, we present a RAMIFY proof rule that attacks the ramification problem head-on and show how to reason with it. Our framework is valid in any separation logic and permits sound compositional and local reasoning in the context of both specified and unspecified sharing. 
We verify the correctness of a number of examples, including programs that manipulate dags, graphs, and overlaid data structures in nontrivial ways.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Totla:2013:CIB, author = "Nishant Totla and Thomas Wies", title = "Complete instantiation-based interpolation", journal = j-SIGPLAN, volume = "48", number = "1", pages = "537--548", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429132", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Craig interpolation has been a valuable tool for formal methods with interesting applications in program analysis and verification. Modern SMT solvers implement interpolation procedures for the theories that are most commonly used in these applications. However, many application-specific theories remain unsupported, which limits the class of problems to which interpolation-based techniques apply. In this paper, we present a generic framework to build new interpolation procedures via reduction to existing interpolation procedures. We consider the case where an application-specific theory can be formalized as an extension of a base theory with additional symbols and axioms. Our technique uses finite instantiation of the extension axioms to reduce an interpolation problem in the theory extension to one in the base theory. We identify a model-theoretic criterion that allows us to detect the cases where our technique is complete. We discuss specific theories that are relevant in program verification and that satisfy this criterion. In particular, we obtain complete interpolation procedures for theories of arrays and linked lists. The latter is the first complete interpolation procedure for a theory that supports reasoning about complex shape properties of heap-allocated data structures. We have implemented this procedure in a prototype on top of existing SMT solvers and used it to automatically infer loop invariants of list-manipulating programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Barr:2013:ADF, author = "Earl T. Barr and Thanh Vo and Vu Le and Zhendong Su", title = "Automatic detection of floating-point exceptions", journal = j-SIGPLAN, volume = "48", number = "1", pages = "549--560", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429133", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is well-known that floating-point exceptions can be disastrous and writing exception-free numerical programs is very difficult. Thus, it is important to automatically detect such errors. In this paper, we present Ariadne, a practical symbolic execution system specifically designed and implemented for detecting floating-point exceptions. Ariadne systematically transforms a numerical program to explicitly check each exception triggering condition. 
Ariadne symbolically executes the transformed program using real arithmetic to find candidate real-valued inputs that can reach and trigger an exception. Ariadne converts each candidate input into a floating-point number, then tests it against the original program. In general, approximating floating-point arithmetic with real arithmetic can change paths from feasible to infeasible and vice versa. The key insight of this work is that, for the problem of detecting floating-point exceptions, this approximation works well in practice because, if one input reaches an exception, many are likely to, and at least one of them will do so over both floating-point and real arithmetic. To realize Ariadne, we also devised a novel, practical linearization technique to solve nonlinear constraints. We extensively evaluated Ariadne over 467 scalar functions in the widely used GNU Scientific Library (GSL). Our results show that Ariadne is practical and identifies a large number of real runtime exceptions in GSL. The GSL developers confirmed our preliminary findings and look forward to Ariadne's public release, which we plan to do in the near future.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Ley-Wild:2013:SAS, author = "Ruy Ley-Wild and Aleksandar Nanevski", title = "Subjective auxiliary state for coarse-grained concurrency", journal = j-SIGPLAN, volume = "48", number = "1", pages = "561--574", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429134", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "From Owicki-Gries' Resource Invariants and Jones' Rely/Guarantee to modern variants based on Separation Logic, axiomatic logics for concurrency require auxiliary state to explicitly relate the effect of all threads to the global invariant on the shared resource. Unfortunately, auxiliary state gives the proof of an individual thread access to the auxiliaries of all other threads. This makes proofs sensitive to the global context, which prevents local reasoning and compositionality. To tame this historical difficulty of auxiliary state, we propose subjective auxiliary state, whereby each thread is verified using a self view (i.e., the thread's effect on the shared resource) and an other view (i.e., the collective effect of all the other threads). Subjectivity generalizes auxiliary state from stacks and heaps to user-chosen partial commutative monoids, which can eliminate the dependence on the global thread structure. We employ subjectivity to formulate Subjective Concurrent Separation Logic as a combination of subjective auxiliary state and Concurrent Separation Logic. The logic yields simple, compositional proofs of coarse-grained concurrent programs that use auxiliary state, and scales to support higher-order recursive procedures that can themselves fork new threads. We prove the soundness of the logic with a novel denotational semantics of action trees and a definition of safety using rely/guarantee transitions over a large subjective footprint. 
We have mechanized the denotational semantics, logic, metatheory, and a number of examples by a shallow embedding in Coq.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '13 conference proceedings.", } @Article{Miller:2013:TSG, author = "Mark Miller", title = "A tested semantics for getters, setters, and eval in {JavaScript}", journal = j-SIGPLAN, volume = "48", number = "2", pages = "1--16", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384579", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present S5, a semantics for the strict mode of the ECMAScript 5.1 (JavaScript) programming language. S5 shrinks the large source language into a manageable core through an implemented transformation. The resulting specification has been tested against real-world conformance suites for the language. This paper focuses on two aspects of S5: accessors (getters and setters) and eval. Since these features are complex and subtle in JavaScript, they warrant special study. Variations on both features are found in several other programming languages, so their study is likely to have broad applicability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Homer:2013:POG, author = "Michael Homer and James Noble and Kim B. Bruce and Andrew P. Black and David J. Pearce", title = "Patterns as objects in {Grace}", journal = j-SIGPLAN, volume = "48", number = "2", pages = "17--28", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384581", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Object orientation and pattern matching are often seen as conflicting approaches to program design. Object-oriented programs place type-dependent behavior inside objects and invoke it via dynamic dispatch, while pattern-matching programs place type-dependent behavior outside data structures and invoke it via multiway conditionals (case statements). Grace is a new, dynamic, object-oriented language designed to support teaching: to this end, Grace needs to support both styles. We explain how this conflict can be resolved gracefully: by modelling patterns and cases as partial functions, reifying those functions as objects, and then building up complex patterns from simpler ones using pattern combinators. We describe the implementation of this design as an object-oriented framework, and a case study of its effectiveness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Bloom:2013:RSP, author = "Bard Bloom and Martin J. 
Hirzel", title = "Robust scripting via patterns", journal = j-SIGPLAN, volume = "48", number = "2", pages = "29--40", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384582", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic typing in scripting languages is a two-edged sword. On the one hand, it can be more flexible and more concise than static typing. On the other hand, it can lead to less robust code. We argue that patterns can give scripts much of the robustness of static typing, without losing the flexibility and concision of dynamic typing. To make this case, we describe a rich pattern system in the dynamic language Thorn. Thorn patterns interact with its control constructs and scoping rules to support concise and robust test-and-extract idioms. Thorn patterns encompass an extensive set of features from ML-style patterns to regular expressions and beyond. And Thorn patterns can be first-class and support pattern-punning (mirror constructor syntax). Overall, this paper describes a powerful pattern system that makes scripting more robust.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Normark:2013:OOP, author = "Kurt N{\o}rmark and Lone Leth Thomsen and Bent Thomsen", title = "Object-oriented programming with gradual abstraction", journal = j-SIGPLAN, volume = "48", number = "2", pages = "41--52", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384583", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe an experimental object-oriented programming language, ASL2, that supports program development by means of a series of abstraction steps. The language allows immediate object construction, and it is possible to use the constructed objects for concrete problem solving tasks. Classes and class hierarchies can be derived from the objects --- via gradual abstraction steps. We introduce two levels of object classification, called weak and strong object classification. Strong object classification relies on conventional classes, whereas weak object classification is looser, and less restrictive. As a central mechanism, weakly classified objects are allowed to borrow methods from each other. ASL2 supports class generalization, as a counterpart to class specialization and inheritance in mainstream object-oriented programming languages. 
The final abstraction step discussed in this paper is a syntactical abstraction step that derives a source file with a syntactical class form.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Pignotti:2013:ADP, author = "Alessandro Pignotti and Adam Welc and Bernd Mathiske", title = "Adaptive data parallelism for {Internet} clients on heterogeneous platforms", journal = j-SIGPLAN, volume = "48", number = "2", pages = "53--62", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384585", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today's Internet is long past static web pages filled with HTML-formatted text sprinkled with an occasional image or animation. We have entered an era of Rich Internet Applications executed locally on Internet clients such as web browsers: games, physics engines, image rendering, photo editing, etc. Yet today's languages used to program Internet clients have limited ability to tap into the computational capabilities of the underlying, often heterogeneous, platforms. In this paper we present how a Domain Specific Language (DSL) can be integrated into ActionScript, one of the most popular scripting languages used to program Internet clients and a close cousin of JavaScript. We demonstrate how our DSL, called ASDP (ActionScript Data Parallel), can be used to enable data parallelism for existing sequential programs. We also present a prototype of a system where data parallel workloads can be executed on either a CPU or a GPU, with the runtime system transparently selecting the best processing unit, depending on the type of workload as well as the architecture and current load of the execution platform. We evaluate performance of our system on a variety of benchmarks, representing different types of workloads: physics, image processing, scientific computing and cryptography.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Ardo:2013:LAO, author = "H{\aa}kan Ard{\"o} and Carl Friedrich Bolz and Maciej Fija{\l}kowski", title = "Loop-aware optimizations in {PyPy}'s tracing {JIT}", journal = j-SIGPLAN, volume = "48", number = "2", pages = "63--72", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384586", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "One of the nice properties of a tracing just-in-time compiler (JIT) is that many of its optimizations are simple, requiring one forward pass only. This is not true for loop-invariant code motion, which is a very important optimization for code with tight kernels, especially for dynamic languages that typically perform quite a lot of loop-invariant type checking, boxed value unwrapping and virtual method lookups. In this paper we explain a scheme pioneered within the context of the LuaJIT project for making basic optimizations loop-aware by using a simple pre-processing step on the trace without changing the optimizations themselves. 
We have implemented the scheme in RPython's tracing JIT compiler. PyPy's Python JIT executing simple numerical kernels can become up to two times faster, bringing the performance into the ballpark of static language compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Wurthinger:2013:SOA, author = "Thomas W{\"u}rthinger and Andreas W{\"o}{\ss} and Lukas Stadler and Gilles Duboscq and Doug Simon and Christian Wimmer", title = "Self-optimizing {AST} interpreters", journal = j-SIGPLAN, volume = "48", number = "2", pages = "73--82", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384587", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "An abstract syntax tree (AST) interpreter is a simple and natural way to implement a programming language. However, it is also considered the slowest approach because of the high overhead of virtual method dispatch. Language implementers therefore define bytecodes to speed up interpretation, at the cost of introducing inflexible and hard to maintain bytecode formats. We present a novel approach to implementing AST interpreters in which the AST is modified during interpretation to incorporate type feedback. This tree rewriting is a general and powerful mechanism to optimize many constructs common in dynamic programming languages. Our system is implemented in Java and uses the static typing and primitive data types of Java elegantly to avoid the cost of boxed representations of primitive values in dynamic programming languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Wernli:2013:OFC, author = "Erwann Wernli and Pascal Maerki and Oscar Nierstrasz", title = "Ownership, filters and crossing handlers: flexible ownership in dynamic languages", journal = j-SIGPLAN, volume = "48", number = "2", pages = "83--94", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384589", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Sharing mutable objects can result in broken invariants, exposure of internal details, and other subtle bugs. To prevent such issues, it is important to control accessibility and aliasing of objects. Dynamic Ownership is an effective way to do so, but its owner-as-dominator discipline is too restrictive: objects are either accessible or not. We propose in this paper to control accessibility and aliasing with more flexibility using two mechanisms, filters and crossing handlers. We demonstrate the benefits of the flexibility offered by these mechanisms, and report on the adaptation of a Smalltalk web server with our approach. 
We conclude that our variant of dynamic ownership is flexible enough to accommodate an existing design, while at the same time constraining it enough to highlight design anomalies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Lerner:2013:DCA, author = "Benjamin S. Lerner and Dan Grossman", title = "Detecting conflicts among declarative {UI} extensions", journal = j-SIGPLAN, volume = "48", number = "2", pages = "95--106", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384590", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We examine overlays, a flexible aspect-like mechanism for third-party declarative extensions of declarative UIs. Overlays can be defined for any markup language and permit extensions to define new content that is dynamically woven into a base UI document. While powerful, overlays are inherently non-modular and may conflict with each other, by defining duplicate or contradictory UI components. We construct an abstract language to capture core overlay semantics, and design an automatic analysis to detect inter-extension conflicts. We apply the analysis to a case study of Firefox extensions, finding several real-world bugs. Our analysis provides low-level feedback to extension developers and high-level reports to end users. Finally, we show how variants of overlays more expressive than those of Firefox complicate conflict detection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Steinert:2013:COA, author = "Bastian Steinert and Damien Cassou and Robert Hirschfeld", title = "{CoExist}: overcoming aversion to change", journal = j-SIGPLAN, volume = "48", number = "2", pages = "107--118", month = feb, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480360.2384591", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:12 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmers make many changes to the program to eventually find a good solution for a given task. In this course of change, every intermediate development state can be of value when, for example, a promising idea suddenly turns out to be inappropriate or the interplay of objects turns out to be more complex than initially expected before making changes. Programmers would benefit from tool support that provides immediate access to source code and run-time of previous development states of interest. We present IDE extensions, implemented for Squeak/Smalltalk, to preserve, retrieve, and work with this information. With such tool support, programmers can work without worries because they can rely on tools that help them with whatever their explorations will reveal. 
They no longer have to follow certain best practices only to avoid undesired consequences of changing code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '12 conference proceedings.", } @Article{Damiani:2013:FFD, author = "Ferruccio Damiani and Luca Padovani and Ina Schaefer", title = "A formal foundation for dynamic delta-oriented software product lines", journal = j-SIGPLAN, volume = "48", number = "3", pages = "1--10", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371403", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Delta-oriented programming (DOP) is a flexible approach for implementing software product lines (SPLs). DOP SPLs are implemented by a code base (a set of delta modules encapsulating changes to object-oriented programs) and a product line declaration (providing the connection of the delta modules with the product features). In this paper, we extend DOP by the capability to switch the implemented product configuration at runtime and present a formal foundation for dynamic DOP. A dynamic DOP SPL is a DOP SPL with a dynamic reconfiguration graph that specifies how to switch between different feature configurations. Dynamic DOP supports (unanticipated) software evolution such that at runtime, the product line declaration, the code base and the dynamic reconfiguration graph can be changed in any (unanticipated) way that preserves the currently running product. The type system of our dynamic DOP core calculus ensures that the dynamic reconfigurations lead to type safe products and do not cause runtime type errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Thum:2013:FBD, author = "Thomas Th{\"u}m and Ina Schaefer and Sven Apel and Martin Hentschel", title = "Family-based deductive verification of software product lines", journal = j-SIGPLAN, volume = "48", number = "3", pages = "11--20", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371404", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A software product line is a set of similar software products that share a common code base. While software product lines can be implemented efficiently using feature-oriented programming, verifying each product individually does not scale, especially if human effort is required (e.g., as in interactive theorem proving). We present a family-based approach of deductive verification to prove the correctness of a software product line efficiently. We illustrate and evaluate our approach for software product lines written in a feature-oriented dialect of Java and specified using the Java Modeling Language. We show that the theorem prover KeY can be used off-the-shelf for this task, without any modifications. 
Compared to the individual verification of each product, our approach reduces the verification time needed for our case study by more than 85\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Ryssel:2013:RFM, author = "Uwe Ryssel and Joern Ploennigs and Klaus Kabitzsch", title = "Reasoning of feature models from derived features", journal = j-SIGPLAN, volume = "48", number = "3", pages = "21--30", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371405", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When using product lines, whose variability models are based on derived features, e.g., Simulink variant objects, the dependencies among the features are only described implicitly. This makes it difficult to verify the mapping of the features to the solution space and to create a comprehensive overview of the feature dependencies like in a feature model. In this paper, an OWL-based approach is presented, which permits the automatic verification of the feature mapping and an automatic feature model synthesis for derived features using OWL reasoning and formal concept analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Rayside:2013:SIA, author = "Derek Rayside and Vajihollah Montaghami and Francesca Leung and Albert Yuen and Kevin Xu and Daniel Jackson", title = "Synthesizing iterators from abstraction functions", journal = j-SIGPLAN, volume = "48", number = "3", pages = "31--40", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371407", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A technique for synthesizing iterators from declarative abstraction functions written in a relational logic specification language is described. The logic includes a transitive closure operator that makes it convenient for expressing reachability queries on linked data structures. Some optimizations, including tuple elimination, iterator flattening, and traversal state reduction, are used to improve performance of the generated iterators. A case study demonstrates that most of the iterators in the widely used JDK Collections classes can be replaced with code synthesized from declarative abstraction functions. These synthesized iterators perform competitively with the hand-written originals. In a user study the synthesized iterators always passed more test cases than the hand-written ones, were almost always as efficient, usually took less programmer effort, and were the qualitative preference of all participants who provided free-form comments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Hulette:2013:CTT, author = "Geoffrey C. Hulette and Matthew Sottile and Allen D. 
Malony", title = "Composing typemaps in {Twig}", journal = j-SIGPLAN, volume = "48", number = "3", pages = "41--49", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371408", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Twig is a language for writing typemaps, programs which transform the type of a value while preserving its underlying meaning. Typemaps are typically used by tools that generate code, such as multi-language wrapper generators, to automatically convert types as needed. Twig builds on existing typemap tools in a few key ways. Twig's typemaps are composable so that complex transformations may be built from simpler ones. In addition, Twig incorporates an abstract, formal model of code generation, allowing it to output code for different target languages. We describe Twig's formal semantics and show how the language allows us to concisely express typemaps. Then, we demonstrate Twig's utility by building an example typemap.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Axelsen:2013:PTD, author = "Eyvind W. Axelsen and Stein Krogdahl", title = "{Package Templates}: a definition by semantics-preserving source-to-source transformations to efficient {Java} code", journal = j-SIGPLAN, volume = "48", number = "3", pages = "50--59", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371409", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Package Templates (PT) is a mechanism designed for writing reusable modules, called templates, each consisting of a set of classes that can be adapted to their use in a program through compile-time specialization. A template must be instantiated in a program before its classes can be used. The mechanism supports type-safe renaming, merging, type parameterization and refinement in the form of static additions and overrides that are orthogonal to the corresponding concepts of ordinary inheritance. In this paper, we consider PT as an extension to Java, and a PT program will then consist of a number of Java packages and templates, where templates are instantiated in packages or other templates. Our aim and main contribution is to define the meaning of such a program, and to show that this definition is consistent. We first show this for a core subset of PT, C-PT, and define a set of source-to-source transformations for converting C-PT programs to plain Java programs using semantics we have described informally in previous papers. We can then define the meaning of a C-PT program in terms of the resulting Java program. Thus, we have to verify that the transformations will always convert a legal C-PT program to a legal Java program. Finally, we briefly discuss how this approach can be extended to full PT. A main challenge is to preserve externally visible names (for classes, methods and fields), and at the same time prevent unwanted subsequent rebindings caused e.g. by over-load resolution in the Java compiler. 
Names that are bound to declarations in a template should not be rebound to different declarations by subsequent compositions or adaptions. In addition to defining the runtime semantics of PT constructs in terms of their translation to Java, the transformation rules can also be seen as a high-level approach to how a compiler for this language might be implemented.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Spacek:2013:ISS, author = "Petr Spacek and Christophe Dony and Chouki Tibermacine and Luc Fabresse", title = "An inheritance system for structural \& behavioral reuse in component-based software programming", journal = j-SIGPLAN, volume = "48", number = "3", pages = "60--69", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371411", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the context of Component-based Programming, which addresses the implementation stage of a component-based software engineering development process, this paper describes a specification and an operational integration of an inheritance system into a self-contained new component-based programming language named Compo. Our proposal completes and extends related works by making it possible to apply inheritance to the full description of components, i.e. both to structural (description of provisions and requirements, of component architecture) and behavioral (full implementations of services) parts in component descriptions. Inheritance in Compo is designed to be used in conjunction with composition to maximize reuse capabilities and expressive power. Compo implementation proposes a clear operational solution for inheritance and for achieving and testing substitutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Zhang:2013:TLC, author = "Huaxi (Yulin) Zhang and Lei Zhang and Christelle Urtado and Sylvain Vauttier and Marianne Huchard", title = "A three-level component model in component based software development", journal = j-SIGPLAN, volume = "48", number = "3", pages = "70--79", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371412", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Component-based development promotes a software development process that focuses on component reuse. How to describe a desired component before searching in the repository? How to find an existing component that fulfills the required functionalities? How to capture the system personalization based on its constitutive components' customization? To answer these questions, this paper claims that components should be described using three different forms at three development stages: architecture specification, configuration and assembly. However, no architecture description language proposes such a detailed description for components that supports such a three step component-based development. 
This paper proposes a three-level ADL, named Dedal, that enables the explicit and separate definitions of component roles, component classes, and component instances.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Freeman:2013:HLW, author = "John Freeman and Jaakko J{\"a}rvi and Gabriel Foust", title = "{HotDrink}: a library for {Web} user interfaces", journal = j-SIGPLAN, volume = "48", number = "3", pages = "80--83", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371413", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "HotDrink is a JavaScript library for constructing forms, dialogs, and other common user interfaces for Web applications. With HotDrink, instead of writing event handlers, developers declare a ``view-model'' in JavaScript and a set of ``bindings'' between the view-model and the HTML elements comprising the view. These specifications tend to be small, but they are enough for HotDrink to provide a fully operational GUI with multi-way dataflows, enabling/disabling of values, activation/deactivation of commands, and data validation. HotDrink implements these rich behaviors, expected of high-quality user interfaces, as generic reusable algorithms. This paper/tool demonstration introduces developers to the HotDrink library by stepping through the construction of an example web application GUI. The library is a concrete realization of our prior work on the ``property models'' approach to declarative GUI programming. To encourage adoption among developers, we have packaged the technology following established web programming conventions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Riche:2013:PSA, author = "T. L. Rich{\'e} and R. Gon{\c{c}}alves and B. Marker and D. Batory", title = "Pushouts in software architecture design", journal = j-SIGPLAN, volume = "48", number = "3", pages = "84--92", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371415", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A classical approach to program derivation is to progressively extend a simple specification and then incrementally refine it to an implementation. We claim this approach is hard or impractical when reverse engineering legacy software architectures. 
We present a case study that shows optimizations and pushouts---in addition to refinements and extensions---are essential for practical stepwise development of complex software architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Bagheri:2013:PSD, author = "Hamid Bagheri and Kevin Sullivan", title = "{Pol}: specification-driven synthesis of architectural code frameworks for platform-based applications", journal = j-SIGPLAN, volume = "48", number = "3", pages = "93--102", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371416", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Developing applications that use complex platforms for functionalities such as authentication and messaging is hard. Model-driven engineering promises to help, but transformation systems are themselves hard to produce. We contribute a new approach using constraint-based synthesis of partial code frameworks that developers complete by hand without the need for hand-coded transformation systems. Rather, synthesis is driven by formal, partial specifications of target platforms and application architectures, and by design (code) fragments encoding application-specific platform usage patterns. We present results of an early evaluation using the case study method to test hypotheses of feasibility and potential industrial utility, using a laboratory model of a nationwide health information network as a subject system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Bauer:2013:FPA, author = "Tim Bauer and Martin Erwig and Alan Fern and Jervis Pinto", title = "Faster program adaptation through reward attribution inference", journal = j-SIGPLAN, volume = "48", number = "3", pages = "103--111", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371417", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the adaptation-based programming (ABP) paradigm, programs may contain variable parts (function calls, parameter values, etc.) that can take a number of different values. Programs also contain reward statements with which a programmer can provide feedback about how well a program is performing with respect to achieving its goals (for example, achieving a high score on some scale). By repeatedly running the program, a machine learning component will, guided by the rewards, gradually adjust the automatic choices made in the variable program parts so that they converge toward an optimal strategy. ABP is a method for semi-automatic program generation in which the choices and rewards offered by programmers allow standard machine-learning techniques to explore a design space defined by the programmer to find an optimal instance of a program template. ABP effectively provides a DSL that allows non-machine-learning experts to exploit machine learning to generate self-optimizing programs. 
Unfortunately, in many cases the placement and structuring of choices and rewards can have a detrimental effect on how an optimal solution to a program-generation problem can be found. To address this problem, we have developed a dataflow analysis that computes influence tracks of choices and rewards. This information can be exploited by an augmented machine-learning technique to ignore misleading rewards and to generally attribute rewards better to the choices that have actually influenced them. Moreover, this technique allows us to detect errors in the adaptive program that might arise out of program maintenance. Our evaluation shows that the dataflow analysis can lead to improvements in performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Efftinge:2013:XID, author = "Sven Efftinge and Moritz Eysholdt and Jan K{\"o}hnlein and Sebastian Zarnekow and Robert von Massow and Wilhelm Hasselbring and Michael Hanus", title = "{Xbase}: implementing domain-specific languages for {Java}", journal = j-SIGPLAN, volume = "48", number = "3", pages = "112--121", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371419", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Xtext is an open-source framework for implementing external, textual domain-specific languages (DSLs). So far, most DSLs implemented with Xtext and similar tools focus on structural aspects such as service specifications and entities. Because behavioral aspects are significantly more complicated to implement, they are often delegated to general-purpose programming languages. This approach introduces complex integration patterns and the DSL's high level of abstraction is compromised. We present Xbase as part of Xtext, an expression language that can be reused via language inheritance in any DSL implementation based on Xtext. Xbase expressions provide both control structures and program expressions in a uniform way. Xbase is statically typed and tightly integrated with the Java type system. Languages extending Xbase inherit the syntax of a Java-like expression language as well as language infrastructure components, including a parser, an unparser, a linker, a compiler and an interpreter. Furthermore, the framework provides integration into the Eclipse IDE including debug and refactoring support. The application of Xbase is presented by means of a domain model language which serves as a tutorial example and by the implementation of the programming language Xtend. Xtend is a functional and object-oriented general purpose language for the Java Virtual Machine (JVM). 
It is built on top of Xbase which is the reusable expression language that is the foundation of Xtend.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Rafkind:2013:HSE, author = "Jon Rafkind and Matthew Flatt", title = "{Honu}: syntactic extension for algebraic notation through enforestation", journal = j-SIGPLAN, volume = "48", number = "3", pages = "122--131", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371420", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Honu is a new language that fuses traditional algebraic notation (e.g., infix binary operators) with Scheme-style language extensibility. A key element of Honu's design is an enforestation parsing step, which converts a flat stream of tokens into an S-expression-like tree, in addition to the initial ``read'' phase of parsing and interleaved with the ``macro-expand'' phase. We present the design of Honu, explain its parsing and macro-extension algorithm, and show example syntactic extensions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Walkingshaw:2013:CMI, author = "Eric Walkingshaw and Martin Erwig", title = "A calculus for modeling and implementing variation", journal = j-SIGPLAN, volume = "48", number = "3", pages = "132--140", month = mar, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480361.2371421", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:18 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a formal calculus for modeling and implementing variation in software. It unifies the compositional and annotative approaches to feature implementation and supports the development of abstractions that can be used to directly relate feature models to their implementation. Since the compositional and annotative approaches are complementary, the calculus enables implementers to use the best combination of tools for the job and focus on inherent feature interactions, rather than those introduced by biases in the representation. The calculus also supports the abstraction of recurring variational patterns and provides a metaprogramming platform for organizing variation in artifacts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Bond:2013:GDG, author = "Michael Bond", title = "{GPUDet}: a deterministic {GPU} architecture", journal = j-SIGPLAN, volume = "48", number = "4", pages = "1--12", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451118", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nondeterminism is a key challenge in developing multithreaded applications. 
Even with the same input, each execution of a multithreaded program may produce a different output. This behavior complicates debugging and limits one's ability to test for correctness. This non-reproducibility situation is aggravated on massively parallel architectures like graphics processing units (GPUs) with thousands of concurrent threads. We believe providing a deterministic environment to ease debugging and testing of GPU applications is essential to enable a broader class of software to use GPUs. Many hardware and software techniques have been proposed for providing determinism on general-purpose multi-core processors. However, these techniques are designed for small numbers of threads. Scaling them to thousands of threads on a GPU is a major challenge. This paper proposes a scalable hardware mechanism, GPUDet, to provide determinism in GPU architectures. In this paper we characterize the existing deterministic and nondeterministic aspects of current GPU execution models, and we use these observations to inform GPUDet's design. For example, GPUDet leverages the inherent determinism of the SIMD hardware in GPUs to provide determinism within a wavefront at no cost. GPUDet also exploits the Z-Buffer Unit, an existing GPU hardware unit for graphics rendering, to allow parallel out-of-order memory writes to produce a deterministic output. Other optimizations in GPUDet include deterministic parallel execution of atomic operations and a workgroup-aware algorithm that eliminates unnecessary global synchronizations. Our simulation results indicate that GPUDet incurs only 2X slowdown on average over a baseline nondeterministic architecture, with runtime overheads as low as 4\% for compute-bound applications, despite running GPU kernels with thousands of threads. We also characterize the sources of overhead for deterministic execution on GPUs to provide insights for further optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Sung:2013:DEH, author = "Hyojin Sung and Rakesh Komuravelli and Sarita V. Adve", title = "{DeNovoND}: efficient hardware support for disciplined non-determinism", journal = j-SIGPLAN, volume = "48", number = "4", pages = "13--26", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451119", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent work has shown that disciplined shared-memory programming models that provide deterministic-by-default semantics can simplify both parallel software and hardware. Specifically, the DeNovo hardware system has shown that the software guarantees of such models (e.g., data-race-freedom and explicit side-effects) can enable simpler, higher performance, and more energy-efficient hardware than the current state-of-the-art for deterministic programs. Many applications, however, contain non-deterministic parts; e.g., using lock synchronization. For commercial hardware to exploit the benefits of DeNovo, it is therefore necessary to extend DeNovo to support non-deterministic applications. This paper proposes DeNovoND, a system that supports lock-based, disciplined non-determinism, with the simplicity, performance, and energy benefits of DeNovo. 
We use a combination of distributed queue-based locks and access signatures to implement simple memory consistency semantics for safe non-determinism, with a coherence protocol that does not require transient states, invalidation traffic, or directories, and does not incur false sharing. The resulting system is simpler, shows comparable or better execution time, and has 33\% less network traffic on average (translating directly into energy savings) relative to a state-of-the-art invalidation-based protocol for 8 applications designed for lock synchronization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Wester:2013:PDR, author = "Benjamin Wester and David Devecsery and Peter M. Chen and Jason Flinn and Satish Narayanasamy", title = "Parallelizing data race detection", journal = j-SIGPLAN, volume = "48", number = "4", pages = "27--38", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451120", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Detecting data races in multithreaded programs is a crucial part of debugging such programs, but traditional data race detectors are too slow to use routinely. This paper shows how to speed up race detection by spreading the work across multiple cores. Our strategy relies on uniparallelism, which executes time intervals of a program (called epochs ) in parallel to provide scalability, but executes all threads from a single epoch on a single core to eliminate locking overhead. We use several techniques to make parallelization effective: dividing race detection into three phases, predicting a subset of the analysis state, eliminating sequential work via transitive reduction, and reducing the work needed to maintain multiple versions of analysis via factorization. We demonstrate our strategy by parallelizing a happens-before detector and a lockset-based detector. We find that uniparallelism can significantly speed up data race detection. With 4x the number of cores as the original application, our strategy speeds up the median execution time by 4.4x for a happens-before detector and 3.3x for a lockset race detector. Even on the same number of cores as the conventional detectors, the ability for uniparallelism to elide analysis locks allows it to reduce the median overhead by 13\% for a happens-before detector and 8\% for a lockset detector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Lucia:2013:CEF, author = "Brandon Lucia and Luis Ceze", title = "Cooperative empirical failure avoidance for multithreaded programs", journal = j-SIGPLAN, volume = "48", number = "4", pages = "39--50", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451121", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrency errors in multithreaded programs are difficult to find and fix. We propose Aviso, a system for avoiding schedule-dependent failures. 
Aviso monitors events during a program's execution and, when a failure occurs, records a history of events from the failing execution. It uses this history to generate schedule constraints that perturb the order of events in the execution and thereby avoids schedules that lead to failures in future program executions. Aviso leverages scenarios where many instances of the same software run, using a statistical model of program behavior and experimentation to determine which constraints most effectively avoid failures. After implementing Aviso, we showed that it decreased failure rates for a variety of important desktop, server, and cloud applications by orders of magnitude, with an average overhead of less than 20\% and, in some cases, as low as 5\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Goiri:2013:PGM, author = "{\'I}{\~n}igo Goiri and William Katsak and Kien Le and Thu D. Nguyen and Ricardo Bianchini", title = "{Parasol} and {GreenSwitch}: managing datacenters powered by renewable energy", journal = j-SIGPLAN, volume = "48", number = "4", pages = "51--64", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451123", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Several companies have recently announced plans to build ``green'' datacenters, i.e. datacenters partially or completely powered by renewable energy. These datacenters will either generate their own renewable energy or draw it directly from an existing nearby plant. Besides reducing carbon footprints, renewable energy can potentially reduce energy costs, reduce peak power costs, or both. However, certain renewable fuels are intermittent, which requires approaches for tackling the energy supply variability. One approach is to use batteries and/or the electrical grid as a backup for the renewable energy. It may also be possible to adapt the workload to match the renewable energy supply. For highest benefits, green datacenter operators must intelligently manage their workloads and the sources of energy at their disposal. In this paper, we first discuss the tradeoffs involved in building green datacenters today and in the future. Second, we present Parasol, a prototype green datacenter that we have built as a research platform. Parasol comprises a small container, a set of solar panels, a battery bank, and a grid-tie. Third, we describe GreenSwitch, our model-based approach for dynamically scheduling the workload and selecting the source of energy to use. Our real experiments with Parasol, GreenSwitch, and MapReduce workloads demonstrate that intelligent workload and energy source management can produce significant cost reductions. Our results also isolate the cost implications of peak power management, storing energy on the grid, and the ability to delay the MapReduce jobs. 
Finally, our results demonstrate that careful workload and energy source management can minimize the negative impact of electrical grid outages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Shen:2013:PCF, author = "Kai Shen and Arrvindh Shriraman and Sandhya Dwarkadas and Xiao Zhang and Zhuan Chen", title = "Power containers: an {OS} facility for fine-grained power and energy management on multicore servers", journal = j-SIGPLAN, volume = "48", number = "4", pages = "65--76", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451124", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy efficiency and power capping are critical concerns in server and cloud computing systems. They face growing challenges due to dynamic power variations from new client-directed web applications, as well as complex behaviors due to multicore resource sharing and hardware heterogeneity. This paper presents a new operating system facility called ``power containers'' that accounts for and controls the power and energy usage of individual fine-grained requests in multicore servers. This facility relies on three key techniques --- (1) online model that attributes multicore power (including shared maintenance power) to concurrently running tasks, (2) alignment of actual power measurements and model estimates to enable online model recalibration, and (3) on-the-fly application-transparent request tracking in multi-stage servers to isolate the power and energy contributions and customize per-request control. Our mechanisms enable new multicore server management capabilities including fair power capping that only penalizes power-hungry requests, and energy-aware request distribution between heterogeneous servers. Our evaluation uses three multicore processors (Intel Woodcrest, Westmere, and SandyBridge) and a variety of server and cloud computing (Google App Engine) workloads. Our results demonstrate the high accuracy of our request power accounting (no more than 11\% errors) and the effectiveness of container-enabled power virus isolation and throttling. Our request distribution case study shows up to 25\% energy saving compared to an alternative approach that recognizes machine heterogeneity but not fine-grained workload affinity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Delimitrou:2013:PQA, author = "Christina Delimitrou and Christos Kozyrakis", title = "{Paragon}: {QoS}-aware scheduling for heterogeneous datacenters", journal = j-SIGPLAN, volume = "48", number = "4", pages = "77--88", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451125", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large-scale datacenters (DCs) host tens of thousands of diverse applications each day. 
However, interference between colocated workloads and the difficulty to match applications to one of the many hardware platforms available can degrade performance, violating the quality of service (QoS) guarantees that many cloud workloads require. While previous work has identified the impact of heterogeneity and interference, existing solutions are computationally intensive, cannot be applied online and do not scale beyond few applications. We present Paragon, an online and scalable DC scheduler that is heterogeneity and interference-aware. Paragon is derived from robust analytical methods and instead of profiling each application in detail, it leverages information the system already has about applications it has previously seen. It uses collaborative filtering techniques to quickly and accurately classify an unknown, incoming workload with respect to heterogeneity and interference in multiple shared resources, by identifying similarities to previously scheduled applications. The classification allows Paragon to greedily schedule applications in a manner that minimizes interference and maximizes server utilization. Paragon scales to tens of thousands of servers with marginal scheduling overheads in terms of time or state. We evaluate Paragon with a wide range of workload scenarios, on both small and large-scale systems, including 1,000 servers on EC2. For a 2,500-workload scenario, Paragon enforces performance guarantees for 91\% of applications, while significantly improving utilization. In comparison, heterogeneity-oblivious, interference-oblivious and least-loaded schedulers only provide similar guarantees for 14\%, 11\% and 3\% of workloads. The differences are more striking in oversubscribed scenarios where resource efficiency is more critical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Tang:2013:RRS, author = "Lingjia Tang and Jason Mars and Wei Wang and Tanima Dey and Mary Lou Soffa", title = "{ReQoS}: reactive static\slash dynamic compilation for {QoS} in warehouse scale computers", journal = j-SIGPLAN, volume = "48", number = "4", pages = "89--100", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451126", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As multicore processors with expanding core counts continue to dominate the server market, the overall utilization of the class of datacenters known as warehouse scale computers (WSCs) depends heavily on colocation of multiple workloads on each server to take advantage of the computational power provided by modern processors. However, many of the applications running in WSCs, such as Web search, are user-facing and have quality of service (QoS) requirements. When multiple applications are co-located on a multicore machine, contention for shared memory resources threatens application QoS as severe cross-core performance interference may occur. WSC operators are left with two options: either disregard QoS to maximize WSC utilization, or disallow the co-location of high-priority user-facing applications with other applications, resulting in low machine utilization and millions of dollars wasted. 
This paper presents ReQoS, a static/dynamic compilation approach that enables low-priority applications to adaptively manipulate their own contentiousness to ensure the QoS of high-priority co-runners. ReQoS is composed of a profile guided compilation technique that identifies and inserts markers in contentious code regions in low-priority applications, and a lightweight runtime that monitors the QoS of high-priority applications and reactively reduces the pressure low-priority applications generate to the memory subsystem when cross-core interference is detected. In this work, we show that ReQoS can accurately diagnose contention and significantly reduce performance interference to ensure application QoS. Applying ReQoS to SPEC2006 and SmashBench workloads on real multicore machines, we are able to improve machine utilization by more than 70\% in many cases, and more than 50\% on average, while enforcing a 90\% QoS threshold. We are also able to improve the energy efficiency of modern multicore machines by 47\% on average over a policy of disallowing co-locations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Arulraj:2013:PRS, author = "Joy Arulraj and Po-Chun Chang and Guoliang Jin and Shan Lu", title = "Production-run software failure diagnosis via hardware performance counters", journal = j-SIGPLAN, volume = "48", number = "4", pages = "101--112", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451128", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Sequential and concurrency bugs are widespread in deployed software. They cause severe failures and huge financial loss during production runs. Tools that diagnose production-run failures with low overhead are needed. The state-of-the-art diagnosis techniques use software instrumentation to sample program properties at run time and use off-line statistical analysis to identify properties most correlated with failures. Although promising, these techniques suffer from high run-time overhead, which is sometimes over 100\%, for concurrency-bug failure diagnosis and hence are not suitable for production-run usage. We present PBI, a system that uses existing hardware performance counters to diagnose production-run failures caused by sequential and concurrency bugs with low overhead. PBI is designed based on several key observations. First, a few widely supported performance counter events can reflect a wide variety of common software bugs and can be monitored by hardware with almost no overhead. Second, the counter overflow interrupt supported by existing hardware and operating systems provides a natural and effective mechanism to conduct event sampling at user level. Third, the noise and non-determinism in interrupt delivery complements well with statistical processing. We evaluate PBI using 13 real-world concurrency and sequential bugs from representative open-source server, client, and utility programs, and 10 bugs from a widely used software-testing benchmark. Quantitatively, PBI can effectively diagnose failures caused by these bugs with a small overhead that is never higher than 10\%. 
Qualitatively, PBI does not require any change to software and presents a novel use of existing hardware performance counters.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Zhang:2013:CFC, author = "Wei Zhang and Marc de Kruijf and Ang Li and Shan Lu and Karthikeyan Sankaralingam", title = "{ConAir}: featherweight concurrency bug recovery via single-threaded idempotent execution", journal = j-SIGPLAN, volume = "48", number = "4", pages = "113--126", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451129", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many concurrency bugs are hidden in deployed software and cause severe failures for end-users. When they finally manifest and become known by developers, they are difficult to fix correctly. To support end-users, we need techniques that help software survive hidden concurrency bugs during production runs. To help developers, we need techniques that fix exposed concurrency bugs. The state-of-the-art techniques on concurrency-bug fixing and survival only satisfy a subset of four important properties: compatibility, correctness, generality, and performance. We aim to develop a system that satisfies all of these four properties. To achieve this goal, we leverage two observations: (1) rolling back a single thread is sufficient to recover from most concurrency-bug failures; (2) reexecuting an idempotent region, which requires no memory-state checkpoint, is sufficient to recover from many concurrency-bug failures. Our system ConAir includes a static analysis component that automatically identifies potential failure sites, a static analysis component that automatically identifies the idempotent code regions around every failure site, and a code-transformation component that inserts rollback-recovery code around the identified idempotent regions. We evaluated ConAir on 10 real-world concurrency bugs in widely used C/C++ open-source applications. These bugs cover different types of failure symptoms and root causes. Quantitatively, ConAir helps software survive failures caused by all of these bugs with negligible run-time overhead ($< 1\%$) and short recovery time. Qualitatively, ConAir can help recover from failures caused by unknown bugs.
It guarantees that program semantics remain unchanged and requires no change to operating systems or hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Viennot:2013:TMR, author = "Nicolas Viennot and Siddharth Nair and Jason Nieh", title = "Transparent mutable replay for multicore debugging and patch validation", journal = j-SIGPLAN, volume = "48", number = "4", pages = "127--138", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451130", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Dora, a mutable record-replay system which allows a recorded execution of an application to be replayed with a modified version of the application. This feature, not available in previous record-replay systems, enables powerful new functionality. In particular, Dora can help reproduce, diagnose, and fix software bugs by replaying a version of a recorded application that is recompiled with debugging information, reconfigured to produce verbose log output, modified to include additional print statements, or patched to fix a bug. Dora uses lightweight operating system mechanisms to record an application execution by capturing nondeterministic events to a log without imposing unnecessary timing and ordering constraints. It replays the log using a modified version of the application even in the presence of added, deleted, or modified operations that do not match events in the log. Dora searches for a replay that minimizes differences between the log and the replayed execution of the modified program. If there are no modifications, Dora provides deterministic replay of the unmodified program. We have implemented a Linux prototype which provides transparent mutable replay without recompiling or relinking applications. We show that Dora is useful for reproducing, diagnosing, and fixing software bugs in real-world applications, including Apache and MySQL. 
Our results show that Dora (1) captures bugs and replays them with applications modified or reconfigured to produce additional debugging output for root cause diagnosis, (2) captures exploits and replays them with patched applications to validate that the patches successfully eliminate vulnerabilities, (3) records production workloads and replays them with patched applications to validate patches with realistic workloads, and (4) maintains low recording overhead on commodity multicore hardware, making it suitable for production systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Sahoo:2013:ULI, author = "Swarup Kumar Sahoo and John Criswell and Chase Geigle and Vikram Adve", title = "Using likely invariants for automated software fault localization", journal = j-SIGPLAN, volume = "48", number = "4", pages = "139--152", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451131", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose an automatic diagnosis technique for isolating the root cause(s) of software failures. We use likely program invariants, automatically generated using correct inputs that are close to the fault-triggering input, to select a set of candidate program locations which are possible root causes. We then trim the set of candidate root causes using software-implemented dynamic backwards slicing, plus two new filtering heuristics: dependence filtering, and filtering via multiple failing inputs that are also close to the failing input. Experimental results on reported software bugs of three large open-source servers show that we are able to narrow down the number of candidate bug locations to between 5 and 17 program expressions, even in programs that are hundreds of thousands of lines long.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Paulos:2013:REA, author = "Eric Paulos", title = "The rise of the expert amateur: {DIY} culture and the evolution of computer science", journal = j-SIGPLAN, volume = "48", number = "4", pages = "153--154", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451133", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We are at an important technological inflection point. Most of our computing systems have been designed and built by professionally trained experts (i.e. us --- computer scientists, engineers, and designers) for use in specific domains and to solve explicit problems. Artifacts often called ``user manuals'' traditionally prescribed the appropriate usage of these tools and implied an acceptable etiquette for interaction and experience.
A fringe group of individuals usually labeled ``hackers'' or ``amateurs'' or ``makers'' have challenged this producer-consumer model of technology by creating novel hardware and software features to ``improve'' our research and products while a similar creative group of technicians called ``artists'' have redirected the techniques, tools, and tenets of accepted technological usage away from their typical manifestations in practicality and product. Over time the technological artifacts of these fringe groups and the support for their rhetoric have gained them a foothold into computing culture and eroded the established power discontinuities within the practice of computing research. We now expect our computing tools to be driven by an architecture of open participation and democracy that encourages users to add value to their tools and applications as they use them. Similarly, the bar for enabling the design of novel, personal computing systems and ``hardware remixes'' has fallen to the point where many non-experts and novices are readily embracing and creating fascinating and ingenious computing artifacts outside of our official and traditionally sanctioned academic and industrial research communities. But how have we as ``expert'' practitioners been influencing this discussion? By constructing a practice around the design and development of technology for task based and problem solving applications, we have unintentionally established such work as the status quo for the human computing experience. We have failed in our duty to open up alternate forums for technology to express itself and touch our lives beyond productivity and efficiency. Blinded by our quest for ``smart technologies'' we have forgotten to contemplate the design of technologies to inspire us to be smarter, more curious, and more inquisitive. We owe it to ourselves to rethink the impact we desire to have on this historic moment in computing culture. We must choose to participate in and perhaps lead a dialogue that heralds an expansive new acceptable practice of designing to enable participation by experts and non-experts alike. We are in the milieu of the rise of the ``expert amateur''. We must change our mantra --- not just performance, completeness, and usability but openness, usefulness and relevancy to our world, its citizens, and our environment. This talk will explore elements of the DIY and maker culture and its relevancy to research questions across computational hardware, languages, and systems. Ultimately, this talk will outline and argue for expanding the design territory and potential opportunities for all of us to collaborate and benefit as a society from this cultural movement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Raghavan:2013:CSH, author = "Arun Raghavan and Laurel Emurian and Lei Shao and Marios Papaefthymiou and Kevin P. Pipe and Thomas F. Wenisch and Milo M. K. 
Martin", title = "Computational sprinting on a hardware\slash software testbed", journal = j-SIGPLAN, volume = "48", number = "4", pages = "155--166", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451135", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "CMOS scaling trends have led to an inflection point where thermal constraints (especially in mobile devices that employ only passive cooling) preclude sustained operation of all transistors on a chip --- a phenomenon called ``dark silicon.'' Recent research proposed computational sprinting --- exceeding sustainable thermal limits for short intervals --- to improve responsiveness in light of the bursty computation demands of many media-rich interactive mobile applications. Computational sprinting improves responsiveness by activating reserve cores (parallel sprinting) and/or boosting frequency/voltage (frequency sprinting) to power levels that far exceed the system's sustainable cooling capabilities, relying on thermal capacitance to buffer heat. Prior work analyzed the feasibility of sprinting through modeling and simulation. In this work, we investigate sprinting using a hardware/software testbed. First, we study unabridged sprints, wherein the computation completes before temperature becomes critical, demonstrating a 6.3x responsiveness gain, and a 6\% energy efficiency improvement by racing to idle. We then analyze truncated sprints, wherein our software runtime system must intervene to prevent overheating by throttling parallelism and frequency before the computation is complete. To avoid oversubscription penalties (context switching inefficiencies after a truncated parallel sprint), we develop a sprint-aware task-based parallel runtime. We find that maximal-intensity sprinting is not always best, introduce the concept of sprint pacing, and evaluate an adaptive policy for selecting sprint intensity. We report initial results using a phase change heat sink to extend maximum sprint duration. Finally, we demonstrate that a sprint-and-rest operating regime can actually outperform thermally-limited sustained execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Ahn:2013:DAS, author = "Wonsun Ahn and Yuelu Duan and Josep Torrellas", title = "{DeAliaser}: alias speculation using atomic region support", journal = j-SIGPLAN, volume = "48", number = "4", pages = "167--180", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451136", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Alias analysis is a critical component in many compiler optimizations. A promising approach to reduce the complexity of alias analysis is to use speculation. The approach consists of performing optimizations assuming the alias relationships that are true most of the time, and repairing the code when such relationships are found not to hold through runtime checks. 
This paper proposes a general alias speculation scheme that leverages upcoming hardware support for transactions with the help of some ISA extensions. The ability of transactions to checkpoint and roll back frees the compiler to pursue aggressive optimizations without having to worry about recovery code. Also, exposing the memory conflict detection hardware in transactions to software allows runtime checking of aliases with little or no overhead. We test the potential of the novel alias speculation approach with Loop Invariant Code Motion (LICM), Global Value Numbering (GVN), and Partial Redundancy Elimination (PRE) optimization passes. On average, they are shown to reduce program execution time by 9\% in SPEC FP2006 applications and 3\% in SPEC INT2006 applications over the alias analysis of a state-of-the-art compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Park:2013:RCH, author = "Heekwon Park and Seungjae Baek and Jongmoo Choi and Donghee Lee and Sam H. Noh", title = "Regularities considered harmful: forcing randomness to memory accesses to reduce row buffer conflicts for multi-core, multi-bank systems", journal = j-SIGPLAN, volume = "48", number = "4", pages = "181--192", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451137", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a novel kernel-level memory allocator, called M$^3$ (M-cube, Multi-core Multi-bank Memory allocator), that has the following two features. First, it introduces and makes use of a notion of a memory container, which is defined as a unit of memory that comprises the minimum number of page frames that can cover all the banks of the memory organization, by exclusively assigning a container to a core so that each core achieves bank parallelism as much as possible. Second, it orchestrates page frame allocation so that pages that threads access are dispersed randomly across multiple banks so that each thread's access pattern is randomized. The development of M$^3$ is based on a tool that we develop to fully understand the architectural characteristics of the underlying memory organization. Using an extension of this tool, we observe that the same application that accesses pages in a random manner outperforms one that accesses pages in a regular pattern such as sequential or same ordered accesses. This is because such randomized accesses reduces inter-thread access interference on the row-buffer in memory banks. We implement M$^3$ in the Linux kernel version 2.6.32 on the Intel Xeon system that has 16 cores and 32GB DRAM. Performance evaluation with various workloads show that M$^3$ improves the overall performance for memory intensive benchmarks by up to 85\% with an average of about 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Honarmand:2013:CUA, author = "Nima Honarmand and Nathan Dautenhahn and Josep Torrellas and Samuel T. 
King and Gilles Pokam and Cristiano Pereira", title = "{Cyrus}: unintrusive application-level record-replay for replay parallelism", journal = j-SIGPLAN, volume = "48", number = "4", pages = "193--206", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451138", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Architectures for deterministic record-replay (R\&R) of multithreaded code are attractive for program debugging, intrusion analysis, and fault-tolerance uses. However, very few of the proposed designs have focused on maximizing replay speed --- a key enabling property of these systems. The few efforts that focus on replay speed require intrusive hardware or software modifications, or target whole-system R\&R rather than the more useful application-level R\&R. This paper presents the first hardware-based scheme for unintrusive, application-level R\&R that explicitly targets high replay speed. Our scheme, called Cyrus, requires no modification to commodity snoopy cache coherence. It introduces the concept of an on-the-fly software Backend Pass during recording which, as the log is being generated, transforms it for high replay parallelism. This pass also fixes-up the log, and can flexibly trade-off replay parallelism for log size. We analyze the performance of Cyrus using full system (OS plus hardware) simulation. Our results show that Cyrus has negligible recording overhead. In addition, for 8-processor runs of SPLASH-2, Cyrus attains an average replay parallelism of 5, and a replay speed that is, on average, only about 50\% lower than the recording speed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{deOliveira:2013:WYS, author = "Augusto Born de Oliveira and Sebastian Fischmeister and Amer Diwan and Matthias Hauswirth and Peter F. Sweeney", title = "Why you should care about quantile regression", journal = j-SIGPLAN, volume = "48", number = "4", pages = "207--218", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451140", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Research has shown that correctly conducting and analysing computer performance experiments is difficult. This paper investigates what is necessary to conduct successful computer performance evaluation by attempting to repeat a prior experiment: the comparison between two Linux schedulers. In our efforts, we found that exploring an experimental space through a series of incremental experiments can be inconclusive, and there may be no indication of how much experimentation will be enough. Analysis of variance (ANOVA), a traditional analysis method, is able to partly solve the problems with the previous approach, but we demonstrate that ANOVA can be insufficient for proper analysis due to the requirements it imposes on the data. Finally, we demonstrate the successful application of quantile regression, a recent development in statistics, to computer performance experiments. 
Quantile regression can provide more insight into the experiment than ANOVA, with the additional benefit of being applicable to data from any distribution. This property makes it especially useful in our field, since non-normally distributed data is common in computer experiments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Curtsinger:2013:SSS, author = "Charlie Curtsinger and Emery D. Berger", title = "{STABILIZER}: statistically sound performance evaluation", journal = j-SIGPLAN, volume = "48", number = "4", pages = "219--228", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451141", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Researchers and software developers require effective performance evaluation. Researchers must evaluate optimizations or measure overhead. Software developers use automatic performance regression tests to discover when changes improve or degrade performance. The standard methodology is to compare execution times before and after applying changes. Unfortunately, modern architectural features make this approach unsound. Statistically sound evaluation requires multiple samples to test whether one can or cannot (with high confidence) reject the null hypothesis that results are the same before and after. However, caches and branch predictors make performance dependent on machine-specific parameters and the exact layout of code, stack frames, and heap objects. A single binary constitutes just one sample from the space of program layouts, regardless of the number of runs. Since compiler optimizations and code changes also alter layout, it is currently impossible to distinguish the impact of an optimization from that of its layout effects. This paper presents Stabilizer, a system that enables the use of the powerful statistical techniques required for sound performance evaluation on modern architectures. Stabilizer forces executions to sample the space of memory configurations by repeatedly re-randomizing layouts of code, stack, and heap objects at runtime. Stabilizer thus makes it possible to control for layout effects. Re-randomization also ensures that layout effects follow a Gaussian distribution, enabling the use of statistical tests like ANOVA. We demonstrate Stabilizer's efficiency ($< 7\%$ median overhead) and its effectiveness by evaluating the impact of LLVM's optimizations on the SPEC CPU2006 benchmark suite. 
We find that, while -O2 has a significant impact relative to -O1, the performance impact of -O3 over -O2 optimizations is indistinguishable from random noise.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Gidra:2013:SSS, author = "Lokesh Gidra and Ga{\"e}l Thomas and Julien Sopena and Marc Shapiro", title = "A study of the scalability of stop-the-world garbage collectors on multicores", journal = j-SIGPLAN, volume = "48", number = "4", pages = "229--240", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451142", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large-scale multicore architectures create new challenges for garbage collectors (GCs). In particular, throughput-oriented stop-the-world algorithms demonstrate good performance with a small number of cores, but have been shown to degrade badly beyond approximately 8 cores on a 48-core machine with OpenJDK 7. This negative result raises the question whether the stop-the-world design has intrinsic limitations that would require a radically different approach. Our study suggests that the answer is no, and that there is no compelling scalability reason to discard the existing highly-optimised throughput-oriented GC code on contemporary hardware. This paper studies the default throughput-oriented garbage collector of OpenJDK 7, called Parallel Scavenge. We identify its bottlenecks, and show how to eliminate them using well-established parallel programming techniques. On the SPECjbb2005, SPECjvm2008 and DaCapo 9.12 benchmarks, the improved GC matches the performance of Parallel Scavenge at low core count, but scales well, up to 48 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{McFarlin:2013:DDO, author = "Daniel S. McFarlin and Charles Tucker and Craig Zilles", title = "Discerning the dominant out-of-order performance advantage: is it speculation or dynamism?", journal = j-SIGPLAN, volume = "48", number = "4", pages = "241--252", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451143", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we set out to study the performance advantages of an Out-of-Order (OOO) processor relative to in-order processors with similar execution resources. In particular, we try to tease apart the performance contributions from two sources: the improved schedules enabled by OOO hardware speculation support and its ability to generate different schedules on different occurrences of the same instructions based on operand and functional unit availability. We find that the ability to express good static schedules achieves the bulk of the speedup resulting from OOO. Specifically, of the 53\% speedup achieved by OOO relative to a similarly provisioned in-order machine, we find that 88\% of that speedup can be achieved by using a single ``best'' static schedule as suggested by observing an OOO schedule of the code.
We discuss the ISA mechanisms that would be required to express these static schedules. Furthermore, we find that the benefits of dynamism largely come from two kinds of events that influence the application's critical path: load instructions that miss in the cache only part of the time and branch mispredictions. We find that much of the benefit of OOO dynamism can be achieved by the potentially simpler task of addressing these two behaviors directly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Checkoway:2013:IAW, author = "Stephen Checkoway and Hovav Shacham", title = "{Iago} attacks: why the system call {API} is a bad untrusted {RPC} interface", journal = j-SIGPLAN, volume = "48", number = "4", pages = "253--264", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451145", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In recent years, researchers have proposed systems for running trusted code on an untrusted operating system. Protection mechanisms deployed by such systems keep a malicious kernel from directly manipulating a trusted application's state. Under such systems, the application and kernel are, conceptually, peers, and the system call API defines an RPC interface between them. We introduce Iago attacks, attacks that a malicious kernel can mount in this model. We show how a carefully chosen sequence of integer return values to Linux system calls can lead a supposedly protected process to act against its interests, and even to undertake arbitrary computation at the malicious kernel's behest. Iago attacks are evidence that protecting applications from malicious kernels is more difficult than previously realized.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Hofmann:2013:ISA, author = "Owen S. Hofmann and Sangman Kim and Alan M. Dunn and Michael Z. Lee and Emmett Witchel", title = "{InkTag}: secure applications on an untrusted operating system", journal = j-SIGPLAN, volume = "48", number = "4", pages = "265--278", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451146", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "InkTag is a virtualization-based architecture that gives strong safety guarantees to high-assurance processes even in the presence of a malicious operating system. InkTag advances the state of the art in untrusted operating systems in both the design of its hypervisor and in the ability to run useful applications without trusting the operating system. We introduce paraverification, a technique that simplifies the InkTag hypervisor by forcing the untrusted operating system to participate in its own verification. Attribute-based access control allows trusted applications to create decentralized access control policies. 
InkTag is also the first system of its kind to ensure consistency between secure data and metadata, ensuring recoverability in the face of system crashes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Giuffrida:2013:SAL, author = "Cristiano Giuffrida and Anton Kuijsten and Andrew S. Tanenbaum", title = "Safe and automatic live update for operating systems", journal = j-SIGPLAN, volume = "48", number = "4", pages = "279--292", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451147", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Increasingly many systems have to run all the time with no downtime allowed. Consider, for example, systems controlling electric power plants and e-banking servers. Nevertheless, security patches and a constant stream of new operating system versions need to be deployed without stopping running programs. These factors naturally lead to a pressing demand for live update---upgrading all or parts of the operating system without rebooting. Unfortunately, existing solutions require significant manual intervention and thus work reliably only for small operating system patches. In this paper, we describe an automated system for live update that can safely and automatically handle major upgrades without rebooting. We have implemented our ideas in Proteos, a new research OS designed with live update in mind. Proteos relies on system support and nonintrusive instrumentation to handle even very complex updates with minimal manual effort. The key novelty is the idea of state quiescence, which allows updates to happen only in safe and predictable system states. A second novelty is the ability to automatically perform transactional live updates at the process level, ensuring a safe and stable update process. Unlike prior solutions, Proteos supports automated state transfer, state checking, and hot rollback. We have evaluated Proteos on 50 real updates and on novel live update scenarios. The results show that our techniques can effectively support both simple and complex updates, while outperforming prior solutions in terms of flexibility, security, reliability, and stability of the update process.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Mai:2013:VSI, author = "Haohui Mai and Edgar Pek and Hui Xue and Samuel Talmadge King and Parthasarathy Madhusudan", title = "Verifying security invariants in {ExpressOS}", journal = j-SIGPLAN, volume = "48", number = "4", pages = "293--304", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451148", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Security for applications running on mobile devices is important. In this paper we present ExpressOS, a new OS for enabling high-assurance applications to run on commodity mobile devices securely. 
Our main contributions are a new OS architecture and our use of formal methods for proving key security invariants about our implementation. In our use of formal methods, we focus solely on proving that our OS implements our security invariants correctly, rather than striving for full functional correctness, requiring significantly less verification effort while still proving the security relevant aspects of our system. We built ExpressOS, analyzed its security, and tested its performance. Our evaluation shows that the performance of ExpressOS is comparable to an Android-based system. In one test, we ran the same web browser on ExpressOS and on an Android-based system, and found that ExpressOS adds 16\% overhead on average to the page load latency time for nine popular web sites.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Schkufza:2013:SS, author = "Eric Schkufza and Rahul Sharma and Alex Aiken", title = "Stochastic superoptimization", journal = j-SIGPLAN, volume = "48", number = "4", pages = "305--316", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451150", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We formulate the loop-free binary superoptimization task as a stochastic search problem. The competing constraints of transformation correctness and performance improvement are encoded as terms in a cost function, and a Markov Chain Monte Carlo sampler is used to rapidly explore the space of all possible programs to find one that is an optimization of a given target program. Although our method sacrifices completeness, the scope of programs we are able to consider, and the resulting quality of the programs that we produce, far exceed those of existing superoptimizers. Beginning from binaries compiled by llvm -O0 for 64-bit x86, our prototype implementation, STOKE, is able to produce programs which either match or outperform the code produced by gcc -O3, icc -O3, and in some cases, expert handwritten assembly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Schulte:2013:ARB, author = "Eric Schulte and Jonathan DiLorenzo and Westley Weimer and Stephanie Forrest", title = "Automated repair of binary and assembly programs for cooperating embedded devices", journal = j-SIGPLAN, volume = "48", number = "4", pages = "317--328", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451151", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a method for automatically repairing arbitrary software defects in embedded systems, which have limited memory, disk and CPU capacities, but exist in great numbers. We extend evolutionary computation (EC) algorithms that search for valid repairs at the source code level to assembly and ELF format binaries, compensating for limited system resources with several algorithmic innovations. 
Our method does not require access to the source code or build toolchain of the software under repair, does not require program instrumentation, specialized execution environments, or virtual machines, or prior knowledge of the bug type. We repair defects in ARM and x86 assembly as well as ELF binaries, observing decreases of 86\% in memory and 95\% in disk requirements, with 62\% decrease in repair time, compared to similar source-level techniques. These advances allow repairs previously possible only with C source code to be applied to any ARM or x86 assembly or ELF executable. Efficiency gains are achieved by introducing stochastic fault localization, with much lower overhead than comparable deterministic methods, and low-level program representations. When distributed over multiple devices, our algorithm finds repairs faster than predicted by naive parallelism. Four devices using our approach are five times more efficient than a single device because of our collaboration model. The algorithm is implemented on Nokia N900 smartphones, with inter-phone communication fitting in 900 bytes sent in 7 SMS text messages per device per repair on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Cui:2013:VSR, author = "Heming Cui and Gang Hu and Jingyue Wu and Junfeng Yang", title = "Verifying systems rules using rule-directed symbolic execution", journal = j-SIGPLAN, volume = "48", number = "4", pages = "329--342", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451152", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Systems code must obey many rules, such as ``opened files must be closed.'' One approach to verifying rules is static analysis, but this technique cannot infer precise runtime effects of code, often emitting many false positives. An alternative is symbolic execution, a technique that verifies program paths over all inputs up to a bounded size. However, when applied to verify rules, existing symbolic execution systems often blindly explore many redundant program paths while missing relevant ones that may contain bugs. Our key insight is that only a small portion of paths are relevant to rules, and the rest (majority) of paths are irrelevant and do not need to be verified. Based on this insight, we create WOODPECKER, a new symbolic execution system for effectively checking rules on systems programs. It provides a set of builtin checkers for common rules, and an interface for users to easily check new rules. It directs symbolic execution toward the program paths relevant to a checked rule, and soundly prunes redundant paths, exponentially speeding up symbolic execution. It is designed to be heuristic-agnostic, enabling users to leverage existing powerful search heuristics. Evaluation on 136 systems programs totaling 545K lines of code, including some of the most widely used programs, shows that, with a time limit of typically just one hour for each verification run, WOODPECKER effectively verifies 28.7\% of the program and rule combinations over bounded input, whereas an existing symbolic execution system KLEE verifies only 8.5\%. For the remaining combinations, WOODPECKER verifies 4.6 times as many relevant paths as KLEE. 
With a longer time limit, WOODPECKER verifies many more paths than KLEE, e.g., 17 times as many with a four-hour limit. WOODPECKER detects 113 rule violations, including 10 serious data loss errors with the 2 most serious ones already confirmed by the corresponding developers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Xiang:2013:HHO, author = "Xiaoya Xiang and Chen Ding and Hao Luo and Bin Bao", title = "{HOTL}: a higher order theory of locality", journal = j-SIGPLAN, volume = "48", number = "4", pages = "343--356", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451153", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The locality metrics are many, for example, miss ratio to test performance, data footprint to manage cache sharing, and reuse distance to analyze and optimize a program. It is unclear how different metrics are related, whether one subsumes another, and what combination may represent locality completely. This paper first derives a set of formulas to convert between five locality metrics and gives the condition for correctness. The transformation is analogous to differentiation and integration used to convert between higher order polynomials. As a result, these metrics can be assigned an order and organized into a hierarchy. Using the new theory, the paper then develops two techniques: one measures the locality in real time without special hardware support, and the other predicts multicore cache interference without parallel testing. The paper evaluates them using sequential and parallel programs as well as for a parallel mix of sequential programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Kang:2013:HPP, author = "Hui Kang and Jennifer L. Wong", title = "To hardware prefetch or not to prefetch?: a virtualized environment study and core binding approach", journal = j-SIGPLAN, volume = "48", number = "4", pages = "357--368", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451155", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most hardware and software vendors suggest disabling hardware prefetching in virtualized environments. They claim that prefetching is detrimental to application performance due to inaccurate prediction caused by workload diversity and VM interference on shared cache. However, no comprehensive or quantitative measurements to support this belief have been performed. This paper is the first to systematically measure the influence of hardware prefetching in virtualized environments. We examine a wide variety of benchmarks on three types of chip-multiprocessors (CMPs) to analyze the hardware prefetching performance. We conduct extensive experiments by taking into account a number of important virtualization factors. We find that hardware prefetching has minimal destructive influence under most configurations.
Only with certain application combinations does prefetching influence the overall performance. To leverage these findings and make hardware prefetching effective across a diversity of virtualized environments, we propose a dynamic prefetching-aware VCPU-core binding approach (PAVCB), which includes two phases --- classifying and binding. The workload of each VM is classified into different cache sharing constraint categories based upon its cache access characteristics, considering both prefetch requests and demand requests. Then following heuristic rules, the VCPUs of each VM are scheduled onto appropriate cores subject to cache sharing constraints. We show that the proposed approach can improve performance by 12\% on average over the default scheduler and 46\% over manual system administrator bindings across different workload combinations in the presence of hardware prefetching.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Kim:2013:DBC, author = "Hwanju Kim and Sangwook Kim and Jinkyu Jeong and Joonwon Lee and Seungryoul Maeng", title = "Demand-based coordinated scheduling for {SMP VMs}", journal = j-SIGPLAN, volume = "48", number = "4", pages = "369--380", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451156", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As processor architectures have been enhancing their computing capacity by increasing core counts, independent workloads can be consolidated on a single node for the sake of high resource efficiency in data centers. With the prevalence of virtualization technology, each individual workload can be hosted on a virtual machine for strong isolation between co-located workloads. Along with this trend, hosted applications have increasingly been multithreaded to take advantage of improved hardware parallelism. Although the performance of many multithreaded applications highly depends on communication (or synchronization) latency, existing schemes of virtual machine scheduling do not explicitly coordinate virtual CPUs based on their communication behaviors. This paper presents a demand-based coordinated scheduling scheme for consolidated virtual machines that host multithreaded workloads. To this end, we propose communication-driven scheduling that controls time-sharing in response to inter-processor interrupts (IPIs) between virtual CPUs. On the basis of in-depth analysis on the relationship between IPI communications and coordination demands, we devise IPI-driven coscheduling and delayed preemption schemes, which effectively reduce synchronization latency and unnecessary CPU consumption. In addition, we introduce a load-conscious CPU allocation policy in order to address load imbalance in heterogeneously consolidated environments. The proposed schemes are evaluated with respect to various scenarios of mixed workloads using the PARSEC multithreaded applications. 
In the evaluation, our scheme improves the overall performance of consolidated workloads, especially communication-intensive applications, by reducing inefficient synchronization latency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Dashti:2013:TMH, author = "Mohammad Dashti and Alexandra Fedorova and Justin Funston and Fabien Gaud and Renaud Lachaize and Baptiste Lepers and Vivien Quema and Mark Roth", title = "Traffic management: a holistic approach to memory placement on {NUMA} systems", journal = j-SIGPLAN, volume = "48", number = "4", pages = "381--394", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451157", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "NUMA systems are characterized by Non-Uniform Memory Access times, where accessing data in a remote node takes longer than a local access. NUMA hardware has been built since the late 80's, and the operating systems designed for it were optimized for access locality. They co-located memory pages with the threads that accessed them, so as to avoid the cost of remote accesses. Contrary to older systems, modern NUMA hardware has much smaller remote wire delays, and so remote access costs per se are not the main concern for performance, as we discovered in this work. Instead, congestion on memory controllers and interconnects, caused by memory traffic from data-intensive applications, hurts performance a lot more. Because of that, memory placement algorithms must be redesigned to target traffic congestion. This requires an arsenal of techniques that go beyond optimizing locality. In this paper we describe Carrefour, an algorithm that addresses this goal. We implemented Carrefour in Linux and obtained performance improvements of up to 3.6 times relative to the default kernel, as well as significant improvements compared to NUMA-aware patch sets available for Linux. Carrefour never hurts performance by more than 4\% when memory placement cannot be improved. We present the design of Carrefour, the challenges of implementing it on modern hardware, and draw insights about hardware support that would help optimize system software on future NUMA systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Jog:2013:OCT, author = "Adwait Jog and Onur Kayiran and Nachiappan Chidambaram Nachiappan and Asit K. Mishra and Mahmut T. Kandemir and Onur Mutlu and Ravishankar Iyer and Chita R.
Das", title = "{OWL}: cooperative thread array aware scheduling techniques for improving {GPGPU} performance", journal = j-SIGPLAN, volume = "48", number = "4", pages = "395--406", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451158", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging GPGPU architectures, along with programming models like CUDA and OpenCL, offer a cost-effective platform for many applications by providing high thread level parallelism at lower energy budgets. Unfortunately, for many general-purpose applications, available hardware resources of a GPGPU are not efficiently utilized, leading to lost opportunity in improving performance. A major cause of this is the inefficiency of current warp scheduling policies in tolerating long memory latencies. In this paper, we identify that the scheduling decisions made by such policies are agnostic to thread-block, or cooperative thread array (CTA), behavior, and as a result inefficient. We present a coordinated CTA-aware scheduling policy that utilizes four schemes to minimize the impact of long memory latencies. The first two schemes, CTA-aware two-level warp scheduling and locality aware warp scheduling, enhance per-core performance by effectively reducing cache contention and improving latency hiding capability. The third scheme, bank-level parallelism aware warp scheduling, improves overall GPGPU performance by enhancing DRAM bank-level parallelism. The fourth scheme employs opportunistic memory-side prefetching to further enhance performance by taking advantage of open DRAM rows. Evaluations on a 28-core GPGPU platform with highly memory-intensive applications indicate that our proposed mechanism can provide 33\% average performance improvement compared to the commonly-employed round-robin warp scheduling policy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Pai:2013:IGC, author = "Sreepathi Pai and Matthew J. Thazhuthaveetil and R. Govindarajan", title = "Improving {GPGPU} concurrency with elastic kernels", journal = j-SIGPLAN, volume = "48", number = "4", pages = "407--418", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451160", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Each new generation of GPUs vastly increases the resources available to GPGPU programs. GPU programming models (like CUDA) were designed to scale to use these resources. However, we find that CUDA programs actually do not scale to utilize all available resources, with over 30\% of resources going unused on average for programs of the Parboil2 suite that we used in our work. Current GPUs therefore allow concurrent execution of kernels to improve utilization. In this work, we study concurrent execution of GPU kernels using multiprogram workloads on current NVIDIA Fermi GPUs. On two-program workloads from the Parboil2 benchmark suite we find concurrent execution is often no better than serialized execution. 
We identify that the lack of control over resource allocation to kernels is a major serialization bottleneck. We propose transformations that convert CUDA kernels into elastic kernels which permit fine-grained control over their resource usage. We then propose several elastic-kernel aware concurrency policies that offer significantly better performance and concurrency compared to the current CUDA policy. We evaluate our proposals on real hardware using multiprogrammed workloads constructed from benchmarks in the Parboil 2 suite. On average, our proposals increase system throughput (STP) by 1.21x and improve the average normalized turnaround time (ANTT) by 3.73x for two-program workloads when compared to the current CUDA concurrency implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Oh:2013:PAL, author = "Taewook Oh and Hanjun Kim and Nick P. Johnson and Jae W. Lee and David I. August", title = "Practical automatic loop specialization", journal = j-SIGPLAN, volume = "48", number = "4", pages = "419--430", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451161", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Program specialization optimizes a program with respect to program invariants, including known, fixed inputs. These invariants can be used to enable optimizations that are otherwise unsound. In many applications, a program input induces predictable patterns of values across loop iterations, yet existing specializers cannot fully capitalize on this opportunity. To address this limitation, we present Invariant-induced Pattern based Loop Specialization (IPLS), the first fully-automatic specialization technique designed for everyday use on real applications. Using dynamic information-flow tracking, IPLS profiles the values of instructions that depend solely on invariants and recognizes repeating patterns across multiple iterations of hot loops. IPLS then specializes these loops, using those patterns to predict values across a large window of loop iterations. This enables aggressive optimization of the loop; conceptually, this optimization reconstructs recurring patterns induced by the input as concrete loops in the specialized binary. IPLS specializes real-world programs that prior techniques fail to specialize without requiring hints from the user. 
Experiments demonstrate a geomean speedup of 14.1\% with a maximum speedup of 138\% over the original codes when evaluated on three script interpreters and eleven scripts each.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Phothilimthana:2013:PPH, author = "Phitchaya Mangpo Phothilimthana and Jason Ansel and Jonathan Ragan-Kelley and Saman Amarasinghe", title = "Portable performance on heterogeneous architectures", journal = j-SIGPLAN, volume = "48", number = "4", pages = "431--444", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451162", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Trends in both consumer and high performance computing are bringing not only more cores, but also increased heterogeneity among the computational resources within a single machine. In many machines, one of the greatest computational resources is now their graphics coprocessors (GPUs), not just their primary CPUs. But GPU programming and memory models differ dramatically from conventional CPUs, and the relative performance characteristics of the different processors vary widely between machines. Different processors within a system often perform best with different algorithms and memory usage patterns, and achieving the best overall performance may require mapping portions of programs across all types of resources in the machine. To address the problem of efficiently programming machines with increasingly heterogeneous computational resources, we propose a programming model in which the best mapping of programs to processors and memories is determined empirically. Programs define choices in how their individual algorithms may work, and the compiler generates further choices in how they can map to CPU and GPU processors and memory systems. These choices are given to an empirical autotuning framework that allows the space of possible implementations to be searched at installation time. The rich choice space allows the autotuner to construct poly-algorithms that combine many different algorithmic techniques, using both the CPU and the GPU, to obtain better performance than any one technique alone. 
Experimental results show that algorithmic changes, and the varied use of both CPUs and GPUs, are necessary to obtain up to a 16.5x speedup over using a single program configuration for all architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Mittal:2013:EVE, author = "Aashish Mittal and Dushyant Bansal and Sorav Bansal and Varun Sethi", title = "Efficient virtualization on embedded {Power Architecture\reg} platforms", journal = j-SIGPLAN, volume = "48", number = "4", pages = "445--458", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451163", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Power Architecture\reg{} processors are popular and widespread on embedded systems, and such platforms are increasingly being used to run virtual machines. While the Power Architecture meets the Popek-and-Goldberg virtualization requirements for traditional trap-and-emulate style virtualization, the performance overhead of virtualization remains high. For example, workloads exhibiting a large amount of kernel activity typically show 3-5x slowdowns over bare-metal. Recent additions to the Linux kernel contain guest and host side paravirtual extensions for Power Architecture platforms. While these extensions improve performance significantly, they are guest-specific, guest-intrusive, and cover only a subset of all possible virtualization optimizations. We present a set of host-side optimizations that achieve comparable performance to the aforementioned paravirtual extensions, on an unmodified guest. Our optimizations are based on adaptive in-place binary translation. Unlike the paravirtual approach, our solution is guest neutral. We implement our ideas in a prototype based on Qemu/KVM. After our modifications, KVM can boot an unmodified Linux guest around 2.5x faster. We contrast our optimization approach with previous similar binary translation based approaches for the x86 architecture; in our experience, each architecture presents a unique set of challenges and optimization opportunities.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Hill:2013:RDC, author = "Mark D. Hill", title = "Research directions for {21st Century} computer systems: {ASPLOS 2013} panel", journal = j-SIGPLAN, volume = "48", number = "4", pages = "459--460", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451165", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Four recent efforts call out architectural challenges and opportunities up and down the software/hardware stack. This panel will discuss, ``What should the community do to facilitate, transcend, or refute these partially overlapping visions?'' The panel is chaired by Mark D. 
Hill with other panel members not finalized for the ASPLOS'13 proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Madhavapeddy:2013:ULO, author = "Anil Madhavapeddy and Richard Mortier and Charalampos Rotsos and David Scott and Balraj Singh and Thomas Gazagnaire and Steven Smith and Steven Hand and Jon Crowcroft", title = "Unikernels: library operating systems for the cloud", journal = j-SIGPLAN, volume = "48", number = "4", pages = "461--472", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451167", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present unikernels, a new approach to deploying cloud services via applications written in high-level source code. Unikernels are single-purpose appliances that are compile-time specialised into standalone kernels, and sealed against modification when deployed to a cloud platform. In return they offer significant reduction in image sizes, improved efficiency and security, and should reduce operational costs. Our Mirage prototype compiles OCaml code into unikernels that run on commodity clouds and offer an order of magnitude reduction in code size without significant performance penalty. The architecture combines static type-safety with a single address-space layout that can be made immutable via a hypervisor extension. Mirage contributes a suite of type-safe protocol libraries, and our results demonstrate that the hypervisor is a platform that overcomes the hardware compatibility issues that have made past library operating systems impractical to deploy in the real-world.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Kadav:2013:FGF, author = "Asim Kadav and Matthew J. Renzelmann and Michael M. Swift", title = "Fine-grained fault tolerance using device checkpoints", journal = j-SIGPLAN, volume = "48", number = "4", pages = "473--484", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451168", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recovering faults in drivers is difficult compared to other code because their state is spread across both memory and a device. Existing driver fault-tolerance mechanisms either restart the driver and discard its state, which can break applications, or require an extensive logging mechanism to replay requests and recreate driver state. Even logging may be insufficient, though, if the semantics of requests are ambiguous. In addition, these systems either require large subsystems that must be kept up-to-date as the kernel changes, or require substantial rewriting of drivers. We present a new driver fault-tolerance mechanism that provides fine-grained control over the code protected. Fine-Grained Fault Tolerance (FGFT) isolates driver code at the granularity of a single entry point. It executes driver code as a transaction, allowing roll back if the driver fails. 
We develop a novel checkpointing mechanism to save and restore device state using existing power management code. Unlike past systems, FGFT can be incrementally deployed in a single driver without the need for a large kernel subsystem, but at the cost of small modifications to the driver. In the evaluation, we show that FGFT can have almost zero runtime cost in many cases, and that checkpoint-based recovery can reduce the duration of a failure by 79\% compared to restarting the driver. Finally, we show that applying FGFT to a driver requires little effort, and the majority of drivers in common classes already contain the power-management code needed for checkpoint/restore.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Silberstein:2013:GIF, author = "Mark Silberstein and Bryan Ford and Idit Keidar and Emmett Witchel", title = "{GPUfs}: integrating a file system with {GPUs}", journal = j-SIGPLAN, volume = "48", number = "4", pages = "485--498", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451169", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "GPU hardware is becoming increasingly general purpose, quickly outgrowing the traditional but constrained GPU-as-coprocessor programming model. To make GPUs easier to program and easier to integrate with existing systems, we propose making the host's file system directly accessible from GPU code. GPUfs provides a POSIX-like API for GPU programs, exploits GPU parallelism for efficiency, and optimizes GPU file access by extending the buffer cache into GPU memory. Our experiments, based on a set of real benchmarks adapted to use our file system, demonstrate the feasibility and benefits of our approach. For example, we demonstrate a simple self-contained GPU program which searches for a set of strings in the entire tree of Linux kernel source files over seven times faster than an eight-core CPU run.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Hunt:2013:DTN, author = "Nicholas Hunt and Tom Bergan and Luis Ceze and Steven D. Gribble", title = "{DDOS}: taming nondeterminism in distributed systems", journal = j-SIGPLAN, volume = "48", number = "4", pages = "499--508", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451170", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nondeterminism complicates the development and management of distributed systems, and arises from two main sources: the local behavior of each individual node as well as the behavior of the network connecting them. Taming nondeterminism effectively requires dealing with both sources. This paper proposes DDOS, a system that leverages prior work on deterministic multithreading to offer: (1) space-efficient record/replay of distributed systems; and (2) fully deterministic distributed behavior.
Leveraging deterministic behavior at each node makes outgoing messages strictly a function of explicit inputs. This allows us to record the system by logging just each message's arrival time, not the contents. Going further, we propose and implement an algorithm that makes all communication between nodes deterministic by scheduling communication onto a global logical timeline. We implement both algorithms in a system called DDOS and evaluate our system with parallel scientific applications, an HTTP/memcached system and a distributed microbenchmark with a high volume of peer-to-peer communication. Our results show up to two orders of magnitude reduction in log size of record/replay, and that distributed systems can be made deterministic with an order of magnitude of overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Wang:2013:TEH, author = "Cheng Wang and Youfeng Wu", title = "{TSO\_ATOMICITY}: efficient hardware primitive for {TSO}-preserving region optimizations", journal = j-SIGPLAN, volume = "48", number = "4", pages = "509--520", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451172", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Program optimizations based on data dependences may not preserve the memory consistency in the programs. Previous works leverage a hardware ATOMICITY primitive to restrict the thread interleaving for preserving sequential consistency in region optimizations. However, ATOMICITY primitive is over restrictive on the thread interleaving for optimizing real-world applications developed with the popular Total-Store-Ordering (TSO) memory consistency, which is weaker than sequential consistency. In this paper, we present a novel hardware TSO\_ATOMICITY primitive, which has less restriction on the thread interleaving than ATOMICITY primitive to permit more efficient program execution than ATOMICITY primitive, but can still preserve TSO memory consistency in all region optimizations. Furthermore, TSO\_ATOMICITY primitive requires similar architecture support as ATOMICITY primitive and can be implemented with only slight change to the existing ATOMICITY primitive implementation. Our experimental results show that in a state-of-the-art dynamic binary optimization system on a large set of workloads, ATOMICITY primitive can only improve the performance by 4\% on average. TSO\_ATOMICITY primitive can reduce the overhead associated with ATOMICITY primitive and improve the performance by 12\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Jafri:2013:WGI, author = "Syed Ali Raza Jafri and Gwendolyn Voskuilen and T. N.
Vijaykumar", title = "{Wait-n-GoTM}: improving {HTM} performance by serializing cyclic dependencies", journal = j-SIGPLAN, volume = "48", number = "4", pages = "521--534", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451173", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactional memory (TM) has been proposed to alleviate some key programmability problems in chip multiprocessors. Most TMs optimistically allow concurrent transactions, detecting read-write or write-write conflicts. Upon conflicts, existing hardware TMs (HTMs) use one of three conflict-resolution policies: (1) always-abort, (2) always-wait for some conflicting transactions to complete, or (3) always-go past conflicts and resolve acyclic conflicts at commit or abort upon cyclic dependencies. While each policy has advantages, the policies degrade performance under contention by limiting concurrency (always-abort, always-wait) or incurring late aborts due to cyclic dependencies (always-go). Thus, while always-go avoids acyclic aborts, no policy avoids cyclic aborts. We propose Wait-n-GoTM (WnGTM) to increase concurrency while avoiding cyclic aborts. We observe that most cyclic dependencies are caused by threads interleaving multiple accesses to a few heavily-read-write-shared delinquent data cache blocks. These accesses occur in code sections called cycle inducer sections (CISTs). Accordingly, we propose Wait-n-Go (WnG) conflict-resolution to avoid many cyclic aborts by predicting and serializing the CISTs. To support the WnG policy, we extend previous HTMs to (1) allow multiple readers and writers, (2) scalably identify dependencies, and (3) detect cyclic dependencies via new mechanisms, namely, conflict transactional state, order-capture, and hardware timestamps, respectively. In 16-core simulations of STAMP, WnGTM achieves average speedups of 46\% for higher-contention benchmarks and 28\% for all benchmarks over always-abort (TokenTM) with low-contention benchmarks remaining unchanged, compared to always-go (DATM) and always-wait (LogTM-SE), which perform worse than and 6\% better than TokenTM, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Qian:2013:VSP, author = "Xuehai Qian and Josep Torrellas and Benjamin Sahelices and Depei Qian", title = "{Volition}: scalable and precise sequential consistency violation detection", journal = j-SIGPLAN, volume = "48", number = "4", pages = "535--548", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451174", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Sequential Consistency (SC) is the most intuitive memory model, and SC Violations (SCVs) produce unintuitive, typically incorrect executions. Most prior SCV detection schemes have used data races as proxies for SCVs, which is highly imprecise. Other schemes that have targeted data-race cycles are either too conservative or are designed only for two-processor cycles and snoopy-based systems. 
This paper presents Volition, the first hardware scheme that detects SCVs in a relaxed-consistency machine precisely, in a scalable manner, and for an arbitrary number of processors in the cycle. Volition leverages cache coherence protocol transactions to dynamically detect cycles in memory-access orders across threads. When a cycle is about to occur, an exception is triggered. Volition can be used in both directory- and snoopy-based coherence protocols. Our simulations of Volition in a 64-processor multicore with directory-based coherence running SPLASH-2 and Parsec programs show that Volition induces negligible traffic and execution overhead. In addition, it can detect SCVs with several processors. Volition is suitable for on-the-fly use.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Grossman:2013:HSF, author = "J. P. Grossman and Jeffrey S. Kuskin and Joseph A. Bank and Michael Theobald and Ron O. Dror and Douglas J. Ierardi and Richard H. Larson and U. Ben Schafer and Brian Towles and Cliff Young and David E. Shaw", title = "Hardware support for fine-grained event-driven computation in {Anton 2}", journal = j-SIGPLAN, volume = "48", number = "4", pages = "549--560", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451175", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Exploiting parallelism to accelerate a computation typically involves dividing it into many small tasks that can be assigned to different processing elements. An efficient execution schedule for these tasks can be difficult or impossible to determine in advance, however, if there is uncertainty as to when each task's input data will be available. Ideally, each task would run in direct response to the arrival of its input data, thus allowing the computation to proceed in a fine-grained event-driven manner. Realizing this ideal is difficult in practice, and typically requires sacrificing flexibility for performance. In Anton 2, a massively parallel special-purpose supercomputer for molecular dynamics simulations, we addressed this challenge by including a hardware block, called the dispatch unit, that provides flexible and efficient support for fine-grained event-driven computation. Its novel features include a many-to-many mapping from input data to a set of synchronization counters, and the ability to prioritize tasks based on their type. To solve the additional problem of using a fixed set of synchronization counters to track input data for a potentially large number of tasks, we created a software library that allows programmers to treat Anton 2 as an idealized machine with infinitely many synchronization counters.
The dispatch unit, together with this library, made it possible to simplify our molecular dynamics software by expressing it as a collection of independent tasks, and the resulting fine-grained execution schedule improved overall performance by up to 16\% relative to a coarse-grained schedule for precisely the same computation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '13 conference proceedings.", } @Article{Vitek:2013:SCR, author = "Jan Vitek", title = "{SIGPLAN Chair}'s report", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "1--2", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gibbons:2013:ASV, author = "Jeremy Gibbons", title = "{ACM SIGPLAN Vice-Chair}'s report", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "3--3", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502511", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Black:2013:SSR, author = "Andrew P. Black", title = "{SIGPLAN Secretary}'s report", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "4--5", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502512", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lopes:2013:STR, author = "Cristina V. Lopes", title = "{SIGPLAN Treasurer}'s report", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "6--6", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502513", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dreyer:2013:SMI, author = "Derek Dreyer", title = "{SIGPLAN} most influential paper awards", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "7--8", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502514", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lawall:2013:SPA, author = "Julia Lawall and Cristina V. 
Lopes", title = "{SIGPLAN Professional Activities Committee} report", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "9--9", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502515", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hind:2013:CRH, author = "Michael Hind", title = "{CACM} research highlights annual report", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "10--11", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502516", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dreyer:2013:PP, author = "Derek Dreyer and John Field and Roberto Giacobazzi and Michael Hicks and Suresh Jagannathan and Mooly Sagiv and Peter Sewell and Phil Wadler", title = "Principles of {POPL}", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "12--16", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502517", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Krishnamurthi:2013:AES, author = "Shriram Krishnamurthi", title = "Artifact evaluation for software conferences", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "17--21", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software and other digital artifacts are amongst the most valuable contributions of computer science. Yet our conferences treat these mostly as second-class artifacts---especially conferences in the software sciences, which ought to know better. This article argues for elevating these other artifacts by making them part of the evaluation process for papers, and reports on experience from an iteration of an Artifact Evaluation Committee for ESEC/FSE 2011.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Flanagan:2013:PES, author = "Cormac Flanagan and K. Rustan M. Leino and Mark Lillibridge and Greg Nelson and James B. 
Saxe and Raymie Stata", title = "{PLDI 2002}: Extended static checking for {Java}", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "22--33", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502520", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software development and maintenance are costly endeavors. The cost can be reduced if more software defects are detected earlier in the development cycle. This paper introduces the Extended Static Checker for Java (ESC/Java), an experimental compile-time program checker that finds common programming errors. The checker is powered by verification-condition generation and automatic theorem-proving techniques. It provides programmers with a simple annotation language with which programmer design decisions can be expressed formally. ESC/Java examines the annotated software and warns of inconsistencies between the design decisions recorded in the annotations and the actual code, and also warns of potential runtime errors in the code. This paper gives an overview of the checker architecture and annotation language and describes our experience applying the checker to tens of thousands of lines of Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Findler:2013:ICH, author = "Robert Bruce Findler and Matthias Felleisen", title = "{ICFP 2002}: Contracts for higher-order functions", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "34--45", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502521", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Assertions play an important role in the construction of robust software. Their use in programming languages dates back to the 1970s. Eiffel, an object-oriented programming language, wholeheartedly adopted assertions and developed the ``Design by Contract'' philosophy. Indeed, the entire object-oriented community recognizes the value of assertion-based contracts on methods. In contrast, languages with higher-order functions do not support assertion-based contracts. Because predicates on functions are, in general, undecidable, specifying such predicates appears to be meaningless. Instead, the functional languages community developed type systems that statically approximate interesting predicates. In this paper, we show how to support higher-order function contracts in a theoretically well-founded and practically viable manner. Specifically, we introduce {$\lambda^{CON}$}, a typed lambda calculus with assertions for higher-order functions. The calculus models the assertion monitoring system that we employ in DrScheme. We establish basic properties of the model (type soundness, etc.) and illustrate the usefulness of contract checking with examples from DrScheme's code base. We believe that the development of an assertion system for higher-order functions serves two purposes.
On one hand, the system has strong practical potential because existing type systems simply cannot express many assertions that programmers would like to state. On the other hand, an inspection of a large base of invariants may provide inspiration for the direction of practical future type system research.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Berger:2013:ORC, author = "Emery D. Berger and Benjamin G. Zorn and Kathryn S. McKinley", title = "{OOPSLA 2002}: Reconsidering custom memory allocation", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "46--57", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmers hoping to achieve performance improvements often use custom memory allocators. This in-depth study examines eight applications that use custom allocators. Surprisingly, for six of these applications, a state-of-the-art general-purpose allocator (the Lea allocator) performs as well as or better than the custom allocators. The two exceptions use regions, which deliver higher performance (improvements of up to 44\%). Regions also reduce programmer burden and eliminate a source of memory leaks. However, we show that the inability of programmers to free individual objects within regions can lead to a substantial increase in memory consumption. Worse, this limitation precludes the use of regions for common programming idioms, reducing their usefulness. We present a generalization of general-purpose and region-based allocators that we call reaps. Reaps are a combination of regions and heaps, providing a full range of region semantics with the addition of individual object deletion. We show that our implementation of reaps provides high performance, outperforming other allocators with region-like semantics. We then use a case study to demonstrate the space advantages and software engineering benefits of reaps in practice. Our results indicate that programmers needing fast regions should use reaps, and that most programmers considering custom allocators should instead use the Lea allocator.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bacon:2013:PRT, author = "David F. Bacon and Perry Cheng and V. T. Rajan", title = "{POPL 2003}: a real-time garbage collector with low overhead and consistent utilization", journal = j-SIGPLAN, volume = "48", number = "4S", pages = "58--71", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2502508.2502523", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 15 15:53:11 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Now that the use of garbage collection in languages like Java is becoming widely accepted due to the safety and software engineering benefits it provides, there is significant interest in applying garbage collection to hard real-time systems. 
Past approaches have generally suffered from one of two major flaws: either they were not provably real-time, or they imposed large space overheads to meet the real-time bounds. We present a mostly non-moving, dynamically defragmenting collector that overcomes both of these limitations: by avoiding copying in most cases, space requirements are kept low; and by fully incrementalizing the collector we are able to meet real-time bounds. We implemented our algorithm in the Jikes RVM and show that at real-time resolution we are able to obtain mutator utilization rates of 45\% with only 1.6--2.5 times the actual space required by the application, a factor of 4 improvement in utilization over the best previously published results. Defragmentation causes no more than 4\% of the traced data to be copied.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wu:2013:HSC, author = "Youfeng Wu", title = "{HW\slash SW} co-designed acceleration of dynamic languages", journal = j-SIGPLAN, volume = "48", number = "5", pages = "1--2", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465555", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic Programming Languages, such as Java, JavaScript, PHP, Perl, Python, Ruby, etc., are dominating languages for programming the web. HW/SW co-designed virtual machine can significantly accelerate their executions by transparently leveraging internal HW features via an internal compiler. We also argue for a common API to interface dynamic languages with the HW/SW co-designed virtual machine, so that a single internal compiler can accelerate all major dynamic languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Khudia:2013:LCC, author = "Daya Shanker Khudia and Scott Mahlke", title = "Low cost control flow protection using abstract control signatures", journal = j-SIGPLAN, volume = "48", number = "5", pages = "3--12", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465568", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The continual trend of shrinking feature sizes and reducing voltage levels makes transistors faster and more efficient. However, it also makes them more susceptible to transient hardware faults. Transient faults due to high energy particle strikes or circuit crosstalk can corrupt the output of a program or cause it to crash. Previous studies have reported that as much as 70\% of the transient faults disturb program control flow, making it critical to protect control flow. Traditional approaches employ signatures to check that every control flow transfer in a program is valid. While having high fault coverage, large performance overheads are introduced by such detailed checking. We propose a coarse-grain control flow checking method to detect transient faults in a cost effective way. 
Our software-only approach is centered on the principle of abstraction: control flow that exhibits simple run-time properties (e.g., proper path length) is almost always completely correct. Our solution targets off-the-shelf commodity embedded systems to provide a low cost protection against transient faults. The proposed technique achieves its efficiency by simplifying signature calculations in each basic block and by performing checking at a coarse-grain level. The coarse-grain signature comparison points are obtained by the use of a region based analysis. In addition, we propose a technique to protect control flow transfers via call and return instructions to ensure all control flow is covered by our technique. Overall, our proposed technique has an average of 11\% performance overhead in comparison to 75\% performance overhead of previously proposed signature based techniques while maintaining approximately the same degree of fault coverage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Chen:2013:BEF, author = "Hao Chen and Chengmo Yang", title = "Boosting efficiency of fault detection and recovery through application-specific comparison and checkpointing", journal = j-SIGPLAN, volume = "48", number = "5", pages = "13--20", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465562", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "While the unending technology scaling has brought reliability to the forefront of concerns of semiconductor industry, fault tolerance techniques are still rarely incorporated into existing designs due to their high overhead. One fault tolerance scheme that receives a lot of research attention is duplication and checkpointing. However, most of the techniques in the category employ a blind strategy to compare instruction results, therefore not only generating large overhead in buffering and verifying these values, but also inducing unnecessary rollbacks to recover faults that will never influence subsequent execution. To tackle these issues, we introduce in this paper an approach that identifies the minimum set of instruction results for fault detection and checkpointing. For a given application, the proposed technique first identifies the control and data flow information of each execution hotspot, and then selects only the instruction results that either influence the final program results or are needed during re-execution as the comparison set. 
Our experimental studies demonstrate that the proposed hotspot-targeting technique is able to reduce nearly 88\% of the comparison overhead and mask over 38\% of the total injected faults while at the same time delivering full fault coverage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Stilkerich:2013:JSE, author = "Isabella Stilkerich and Michael Strotz and Christoph Erhardt and Martin Hoffmann and Daniel Lohmann and Fabian Scheler and Wolfgang Schr{\"o}der-Preikschat", title = "A {JVM} for soft-error-prone embedded systems", journal = j-SIGPLAN, volume = "48", number = "5", pages = "21--32", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465571", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The reduction of structure sizes in microcontrollers, environmental conditions or low supply voltages increase the susceptibility of embedded systems to soft errors. As a result, the employment of fault-detection and fault-tolerance measures is becoming a mandatory task even for moderately critical applications. Accordingly, software-based techniques have recently gained in popularity, and a multitude of approaches that differ in the number and frequency of tolerated errors as well as their associated overhead have been proposed. Using type-safe programming languages to isolate critical software components is very popular among those techniques. An automated application of fault-detection and fault-tolerance measures based on the type system of the programming language and static code analyses is possible. It facilitates an easy evaluation of the protection characteristics and costs, as well as the migration of software to new hardware platforms with different failure rates. Transient faults, however, are not bound to the application code secured by the type system, but can also affect the correctness of the type system itself. Thereby, the type system might lose its ability to isolate critical components. As a consequence, it is essential to also protect the type system itself against soft errors. In this paper, we show how soft errors can affect the integrity of the type system. Furthermore, we provide means to secure it against these faults, thus preserving its isolating character.
These measures can be applied selectively to achieve a suitable tradeoff between level of protection and resource consumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Finlayson:2013:IPE, author = "Ian Finlayson and Brandon Davis and Peter Gavin and Gang-Ryung Uh and David Whalley and Magnus Sj{\"a}lander and Gary Tyson", title = "Improving processor efficiency by statically pipelining instructions", journal = j-SIGPLAN, volume = "48", number = "5", pages = "33--44", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465559", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A new generation of applications requires reduced power consumption without sacrificing performance. Instruction pipelining is commonly used to meet application performance requirements, but some implementation aspects of pipelining are inefficient with respect to energy usage. We propose static pipelining as a new instruction set architecture to enable more efficient instruction flow through the pipeline, which is accomplished by exposing the pipeline structure to the compiler. While this approach simplifies hardware pipeline requirements, significant modifications to the compiler are required. This paper describes the code generation and compiler optimizations we implemented to exploit the features of this architecture. We show that we can achieve performance and code size improvements despite a very low-level instruction representation. We also demonstrate that static pipelining of instructions reduces energy usage by simplifying hardware, avoiding many unnecessary operations, and allowing the compiler to perform optimizations that are not possible on traditional architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Porpodas:2013:LLA, author = "Vasileios Porpodas and Marcelo Cintra", title = "{LUCAS}: latency-adaptive unified cluster assignment and instruction scheduling", journal = j-SIGPLAN, volume = "48", number = "5", pages = "45--54", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465565", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Clustered VLIW architectures are statically scheduled wide-issue architectures that combine the advantages of wide-issue processors along with the power and frequency scalability of clustered designs. Being statically scheduled, they require that the decision of mapping instructions to clusters be done by the compiler. State-of-the-art code generation for such architectures combines cluster-assignment and instruction scheduling in a single unified pass. The performance of the generated code, however, is very susceptible to the inter-cluster communication latency. This is due to the nature of the two clustering heuristics used. One is aggressive and works well for low inter-cluster latencies, while the other is more conservative and works well only for high latencies. 
In this paper we propose LUCAS, a novel unified cluster-assignment and instruction-scheduling algorithm that adapts to the inter-cluster latency better than the existing state-of-the-art schemes. LUCAS is a hybrid scheme that performs fine-grain switching between the two state-of-the art clustering heuristics, leading to better scheduling than either of them. It generates better performing code for a wide range of inter-cluster latency values.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Jang:2013:PSP, author = "Hakbeom Jang and Channoh Kim and Jae W. Lee", title = "Practical speculative parallelization of variable-length decompression algorithms", journal = j-SIGPLAN, volume = "48", number = "5", pages = "55--64", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465557", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Variable-length coding is widely used for efficient data compression. Typically, the compressor splits the original data into blocks and compresses each block with variable-length codes, hence producing variable-length compressed blocks. Although the compressor can easily exploit ample block-level parallelism, it is much more difficult to extract such coarse-grain parallelism from the decompressor because a block boundary cannot be located until decompression of the previous block is completed. This paper presents novel algorithms to efficiently predict block boundaries and a runtime system that enables efficient block-level parallel decompression, called SDM. The SDM execution model features speculative pipelining with three stages: Scanner, Decompressor, and Merger. The scanner stage employs a high-confidence prediction algorithm that finds compressed block boundaries without fully decompressing individual blocks. This information is communicated to the parallel decompressor stage in which multiple blocks are decompressed in parallel. The decompressed blocks are merged in order by the merger stage to produce the final output. The SDM runtime is specialized to execute this pipeline correctly and efficiently on resource-constrained embedded platforms. 
With SDM we effectively parallelize three production-grade variable-length decompression algorithms --- zlib, bzip2, and H.264 --- with maximum speedups of $ 2.50 \times $ and $ 8.53 \times $ (and geometric mean speedups of $ 1.96 \times $ and $ 4.04 \times $ ) on 4-core and 36-core embedded platforms, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Chattopadhyay:2013:PPS, author = "Sudipta Chattopadhyay and Lee Kee Chong and Abhik Roychoudhury", title = "Program performance spectrum", journal = j-SIGPLAN, volume = "48", number = "5", pages = "65--76", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465566", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Real-time and embedded applications often need to satisfy several non-functional properties such as timing. Consequently, performance validation is a crucial stage before the deployment of real-time and embedded software. Cache memories are often used to bridge the performance gap between a processor and memory subsystems. As a result, the analysis of caches plays a key role in the performance validation of real-time, embedded software. In this paper, we propose a novel approach to compute the cache performance signature of an entire program. Our technique is based on exploring the input domain through different path programs. Two paths belong to the same path program if they follow the same set of control flow edges but may vary in the iterations of loops encountered. Our experiments with several subject programs show that the different paths grouped into a path program have very similar and often exactly the same cache performance. Our path program exploration can be viewed as partitioning the input domain of the program. Each partition is associated with its cache performance and a symbolic formula capturing the set of program inputs which constitutes the partition. We show that such a partitioning technique has widespread usage in performance prediction, testing, debugging and design space exploration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Moreno:2013:NIP, author = "Carlos Moreno and Sebastian Fischmeister and M. Anwar Hasan", title = "Non-intrusive program tracing and debugging of deployed embedded systems through side-channel analysis", journal = j-SIGPLAN, volume = "48", number = "5", pages = "77--88", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465570", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "One of the hardest aspects of embedded software development is that of debugging, especially when faulty behavior is observed at the production or deployment stage. Non-intrusive observation of the system's behavior is often insufficient to infer the cause of the problem and identify and fix the bug.
In this work, we present a novel approach for non-intrusive program tracing aimed at assisting developers in the task of debugging embedded systems at deployment or production stage, where standard debugging tools are usually no longer available. The technique is rooted in cryptography, in particular the area of side-channel attacks. Our proposed technique expands the scope of these cryptographic techniques so that we recover the sequence of operations from power consumption observations (power traces). To this end, we use digital signal processing techniques (in particular, spectral analysis) combined with pattern recognition techniques to determine blocks of source code being executed given the observed power trace. One of the important highlights of our contribution is the fact that the system works on a standard PC, capturing the power traces through the recording input of the sound card. Experimental results are presented and confirm that the approach is viable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Beemster:2013:RCD, author = "Marcel Beemster", title = "The role of {C} in the dark ages of multi-core", journal = j-SIGPLAN, volume = "48", number = "5", pages = "89--90", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465556", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Contrary to predictions of its demise, C remains a dominant programming language, especially in embedded systems. Speed and transparency dictate that it will be so for the next decade, despite its supposed unsuitability for programming parallel architectures. A flexible compiler development system is a unique vehicle to bend the C language and its implementation to the developers' will. Using hard-won experience in applying extended versions of C to diverse parallel architectures, C's potential in the dark ages of multi-core programming is examined.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '12 conference proceedings.", } @Article{Wang:2013:FHF, author = "Tianzheng Wang and Duo Liu and Yi Wang and Zili Shao", title = "{FTL 2}: a hybrid {\em f\/}lash {\em t\/}ranslation {\em l\/}ayer with logging for write reduction in flash memory", journal = j-SIGPLAN, volume = "48", number = "5", pages = "91--100", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465563", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "NAND flash memory has been widely used to build embedded devices such as smartphones and solid state drives (SSD) because of its high performance, low power consumption, great shock resistance and small form factor. However, its lifetime and performance are greatly constrained by partial page updates, which will lead to early depletion of free pages and frequent garbage collections. On the one hand, partial page updates are prevalent as a large portion of I/O does not modify file contents drastically. 
On the other hand, general-purpose cache usually does not specifically consider and eliminate duplicated contents, despite its popularity. In this paper, we propose a hybrid approach called FTL$^2$, which employs both logging and mapping techniques in flash translation layer (FTL), to tackle the endurance problem and performance degradation caused by partial page updates in flash memory. FTL$^2$ logs the latest contents in a high-speed temporary storage, called Content Cache, to handle partial page updates. Experimental results show that FTL$^2$ can greatly reduce page writes and postpone garbage collections with a small overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '13 conference proceedings.", } @Article{Li:2013:CDW, author = "Qingan Li and Lei Jiang and Youtao Zhang and Yanxiang He and Chun Jason Xue", title = "Compiler directed write-mode selection for high performance low power volatile {PCM}", journal = j-SIGPLAN, volume = "48", number = "5", pages = "101--110", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465564", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Micro-Controller Units (MCUs) are widely adopted ubiquitous computing devices. Due to tight cost and energy constraints, MCUs often integrate very limited internal RAM memory on top of Flash storage, which exposes Flash to heavy write traffic and results in short system lifetime. Architecting emerging Phase Change Memory (PCM) is a promising approach for MCUs due to its fast read speed and long write endurance. However, PCM, especially multi-level cell (MLC) PCM, has long write latency and requires large write energy, which diminishes the benefits of its replacement of traditional Flash. By studying MLC PCM write operations, we observe that writing MLC PCM can take advantage of two write modes --- fast write leaves cells in volatile state, and slow write leaves cells in non-volatile state. In this paper, we propose a compiler directed dual-write (CDDW) scheme that selects the best write mode for each write operation to maximize the overall performance and energy efficiency. Our experimental results show that CDDW reduces dynamic energy by 32.4\%(33.8\%) and improves performance by 6.3\%(35.9\%) compared with an all fast(slow) write approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '13 conference proceedings.", } @Article{Guan:2013:BBL, author = "Yong Guan and Guohui Wang and Yi Wang and Renhai Chen and Zili Shao", title = "{BLog}: block-level log-block management for {NAND} flash memory storage systems", journal = j-SIGPLAN, volume = "48", number = "5", pages = "111--120", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465560", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Log-block-based FTL (Flash Translation Layer) schemes have been widely used to manage NAND flash memory storage systems in industry.
In log-block-based FTLs, a few physical blocks called log blocks are used to hold all page updates from a large number of data blocks. Frequent page updates in log blocks introduce significant overhead, so log blocks become the system bottleneck. To address this problem, this paper presents a block-level log-block management scheme called BLog (Block-level Log-Block Management). In BLog, with the block level management, the update pages of a data block can be collected together and put into the same log block as much as possible; therefore, we can effectively reduce the associativities of log blocks so as to reduce the garbage collection overhead. We also propose a novel partial merge operation called reduced-order merge by which we can effectively postpone the garbage collection of log blocks so as to maximally utilize valid pages and reduce unnecessary erase operations in log blocks. Based on BLog, we design an FTL called BLogFTL for MLC NAND flash. We conduct experiments on a mixture of real-world and synthetic traces. The experimental results show that our scheme outperforms the previous log-block-based FTLs for MLC NAND flash.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '13 conference proceedings.", } @Article{Mehiaoui:2013:TSO, author = "Asma Mehiaoui and Ernest Wozniak and Sara Tucci-Piergiovanni and Chokri Mraidha and Marco {Di Natale} and Haibo Zeng and Jean-Philippe Babau and Laurent Lemarchand and S{\'e}bastien Gerard", title = "A two-step optimization technique for functions placement, partitioning, and priority assignment in distributed systems", journal = j-SIGPLAN, volume = "48", number = "5", pages = "121--132", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465572", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern development methodologies from the industry and the academia for complex real-time systems define a stage in which application functions are deployed onto an execution platform. The deployment consists of the placement of functions on a distributed network of nodes, the partitioning of functions in tasks and the scheduling of tasks and messages. None of the existing optimization techniques deal with the three stages of the deployment problem at the same time. In this paper, we present a staged approach towards the efficient deployment of real-time functions based on genetic algorithms and mixed integer linear programming techniques.
Application to case studies shows the applicability of the method to industry-size systems and the quality of the obtained solutions when compared to the true optimum for small size examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '13 conference proceedings.", } @Article{Bouakaz:2013:BME, author = "Adnan Bouakaz and Jean-Pierre Talpin", title = "Buffer minimization in earliest-deadline first scheduling of dataflow graphs", journal = j-SIGPLAN, volume = "48", number = "5", pages = "133--142", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465558", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Symbolic schedulability analysis of dataflow graphs is the process of synthesizing the timing parameters (i.e. periods, phases, and deadlines) of actors so that the task system is schedulable and achieves a high throughput when using a specific scheduling policy. Furthermore, the resulting schedule must ensure that communication buffers are underflow- and overflow-free. This paper describes a (partitioned) earliest-deadline first symbolic schedulability analysis of dataflow graphs that minimizes the buffering requirements. Our scheduling analysis consists of three major steps. (1) The construction of an abstract affine schedule of the graph that excludes overflow and underflow exceptions and minimizes the buffering requirements assuming some precedences between jobs. (2) Symbolic deadlines adjustment that guarantees precedences without the need for lock-based synchronizations. (3) The concretization of the affine schedule using a symbolic, fast-converging, processor-demand analysis for both uniprocessor and multiprocessor systems. Experimental results show that our technique improves the buffering requirements in many cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '13 conference proceedings.", } @Article{Geuns:2013:ADM, author = "Stefan J. Geuns and Joost P. H. M. Hausmans and Marco J. G. Bekooij", title = "Automatic dataflow model extraction from modal real-time stream processing applications", journal = j-SIGPLAN, volume = "48", number = "5", pages = "143--152", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465561", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many real-time stream processing applications are initially described as a sequential application containing while-loops, which execute for an unknown number of iterations. These modal applications have to be executed in parallel on an MPSoC system in order to meet their real-time throughput constraints. However, no suitable approach exists that can automatically derive a temporal analysis model from a sequential specification containing while-loops with an unknown number of iterations. This paper introduces an approach to the automatic generation of a Structured Variable-rate Phased Dataflow (SVPDF) model from a sequential specification of a modal application.
The real-time requirements of an application can be analyzed despite the presence of while-loops with an unknown number of iterations. It is shown that an algorithm that has a polynomial time computational complexity can be applied on the generated SVPDF model to determine whether a throughput constraint can be met. The enabler for the automatic generation of an SVPDF model is the decoupling of synchronization between tasks that contain different while-loops. A DVB-T radio transceiver illustrates the derivation of the SVPDF model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '13 conference proceedings.", } @Article{Wang:2013:PMO, author = "Cheng Wang and Sunita Chandrasekaran and Peng Sun and Barbara Chapman and Jim Holt", title = "Portable mapping of {OpenMP} to multicore embedded systems using {MCA APIs}", journal = j-SIGPLAN, volume = "48", number = "5", pages = "153--162", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465569", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multicore embedded systems are being widely used in telecommunication systems, robotics, medical applications and more. While they offer a high-performance, low-power solution, programming in an efficient way is still a challenge. In order to exploit the capabilities that the hardware offers, software developers are expected to handle many of the low-level details of programming including utilizing DMA, ensuring cache coherency, and inserting synchronization primitives explicitly. The state-of-the-art involves solutions where the software toolchain is too vendor-specific, thus tying the software to particular hardware and leaving no room for portability. In this paper we present a runtime system to explore mapping a high-level programming model, OpenMP, onto multicore embedded systems. A key feature of our scheme is that unlike the existing approaches that largely rely on POSIX threads, our approach leverages the Multicore Association (MCA) APIs as an OpenMP translation layer. The MCA APIs are a set of low-level APIs handling resource management, inter-process communications and task scheduling for multicore embedded systems. By deploying the MCA APIs, our runtime is able to effectively capture the characteristics of multicore embedded systems compared with the POSIX threads. Furthermore, the MCA layer enables our runtime implementation to be portable across various architectures. Thus programmers only need to maintain a single OpenMP code base which is compatible with various compilers, while on the other hand, the code is portable across different possible types of platforms. We have evaluated our runtime system using several embedded benchmarks.
The experiments demonstrate promising and competitive performance compared to the native approach for the platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '13 conference proceedings.", } @Article{Huber:2013:CWA, author = "Benedikt Huber and Daniel Prokesch and Peter Puschner", title = "Combined {WCET} analysis of bitcode and machine code using control-flow relation graphs", journal = j-SIGPLAN, volume = "48", number = "5", pages = "163--172", month = may, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499369.2465567", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:32 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Static program analyses like stack usage analysis and worst-case execution time (WCET) analysis depend on the actual machine code generated by the compiler for the target system. As the analysis of binary code is costly, hard to diagnose and platform dependent, it is preferable to carry out parts of these analyses on a higher-level program representation. To this end, the higher-level code and the machine code need to be related, a difficult task due to the complexity of modern optimizing compilers. In this article, we present a novel representation called control-flow relation graphs, which provide an accurate model of the control-flow relation between machine code and the compiler's intermediate representation. In order to facilitate the integration of our approach in existing compiler frameworks, we develop a construction algorithm that builds the control-flow relation graph from partial mappings provided by the compiler. The WCET calculation method for control-flow relation graphs processes flow information from both the intermediate representation and machine code. Furthermore, we demonstrate the transformation of flow information from the IR to the machine code level, in order to use existing industrial-strength WCET analysis tools operating on machine code. We implemented the construction algorithm within the LLVM compiler framework, along with an implementation of the combined WCET calculation method. The evaluation demonstrates that the approach is able to relate bitcode (LLVM's intermediate representation) and machine code in a precise way, with a WCET increase of at most 2\% when using flow facts on the bitcode level, compared to equivalent ones on the machine-code level.
As the methods presented in this article provide a cost-effective way to reuse platform independent flow information, they have the potential to simplify WCET analysis, and popularize its use in the development process of real-time systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '13 conference proceedings.", } @Article{Smaragdakis:2013:LYF, author = "Yannis Smaragdakis", title = "Look up!: your future is in the cloud", journal = j-SIGPLAN, volume = "48", number = "6", pages = "1--2", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462157", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The ``Cloud'' is a wonderfully expansive phrase used to denote computation and data storage centralized in a large datacenter and elastically accessed across a network. The concept is not new; web sites and business servers have run in datacenters for a long time. These, however, were specialized applications, outside of the mainstream of desktop programs. The past few years have seen enormous change as the mainstream shifts from a single computer to mobile devices and clusters of computers. Three factors are driving this change. (1) Mobile computing, where apps run on a size- and power-constrained device and would be far less interesting without backend systems to augment computation and storage capacity. (2) Big data, which uses clusters of computers to extract valuable information from vast amounts of unstructured data. (3) Inexpensive, elastic computing, pioneered by Amazon Web Services, which enables everyone to rapidly obtain and use many servers. As a researcher from the language and compiler community, I firmly believe this sea change is at heart a programming problem. Cloud computing is far different from the environment in which most of today's languages and tools were developed, and few programmers have mastered its complexity. New challenges include pervasive parallelism, partial failure, high and variable communication latency, and replication for reliability and throughput.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Cheung:2013:ODB, author = "Alvin Cheung and Armando Solar-Lezama and Samuel Madden", title = "Optimizing database-backed applications with query synthesis", journal = j-SIGPLAN, volume = "48", number = "6", pages = "3--14", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462180", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Object-relational mapping libraries are a popular way for applications to interact with databases because they provide transparent access to the database using the same language as the application. Unfortunately, using such frameworks often leads to poor performance, as modularity concerns encourage developers to implement relational operations in application code.
Such application code does not take advantage of the optimized relational implementations that database systems provide, such as efficient implementations of joins or push down of selection predicates. In this paper we present QBS, a system that automatically transforms fragments of application logic into SQL queries. QBS differs from traditional compiler optimizations as it relies on synthesis technology to generate invariants and postconditions for a code fragment. The postconditions and invariants are expressed using a new theory of ordered relations that allows us to reason precisely about both the contents and order of the records produced by complex code fragments that compute joins and aggregates. The theory is close in expressiveness to SQL, so the synthesized postconditions can be readily translated to SQL queries. Using 75 code fragments automatically extracted from over 120k lines of open-source code written using the Java Hibernate ORM, we demonstrate that our approach can convert a variety of imperative constructs into relational specifications and significantly improve application performance asymptotically by orders of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Singh:2013:AFG, author = "Rishabh Singh and Sumit Gulwani and Armando Solar-Lezama", title = "Automated feedback generation for introductory programming assignments", journal = j-SIGPLAN, volume = "48", number = "6", pages = "15--26", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462195", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new method for automatically providing feedback for introductory programming problems. In order to use this method, we need a reference implementation of the assignment, and an error model consisting of potential corrections to errors that students might make. Using this information, the system automatically derives minimal corrections to students' incorrect solutions, providing them with a measure of exactly how incorrect a given solution was, as well as feedback about what they did wrong. We introduce a simple language for describing error models in terms of correction rules, and formally define a rule-directed translation strategy that reduces the problem of finding minimal corrections in an incorrect program to the problem of synthesizing a correct program from a sketch. We have evaluated our system on thousands of real student attempts obtained from the Introduction to Programming course at MIT (6.00) and MITx (6.00x).
Our results show that relatively simple error models can correct on average 64\% of all incorrect submissions in our benchmark set.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Gvero:2013:CCU, author = "Tihomir Gvero and Viktor Kuncak and Ivan Kuraj and Ruzica Piskac", title = "Complete completion using types and weights", journal = j-SIGPLAN, volume = "48", number = "6", pages = "27--38", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462192", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Developing modern software typically involves composing functionality from existing libraries. This task is difficult because libraries may expose many methods to the developer. To help developers in such scenarios, we present a technique that synthesizes and suggests valid expressions of a given type at a given program point. As the basis of our technique we use type inhabitation for lambda calculus terms in long normal form. We introduce a succinct representation for type judgements that merges types into equivalence classes to reduce the search space, then reconstructs any desired number of solutions on demand. Furthermore, we introduce a method to rank solutions based on weights derived from a corpus of code. We implemented the algorithm and deployed it as a plugin for the Eclipse IDE for Scala. We show that the techniques we incorporated greatly increase the effectiveness of the approach. Our evaluation benchmarks are code examples from programming practice; we make them available for future comparisons.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Johnson:2013:FCP, author = "Nick P. Johnson and Taewook Oh and Ayal Zaks and David I. August", title = "Fast condensation of the program dependence graph", journal = j-SIGPLAN, volume = "48", number = "6", pages = "39--50", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2491960", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Aggressive compiler optimizations are formulated around the Program Dependence Graph (PDG). Many techniques, including loop fission and parallelization are concerned primarily with dependence cycles in the PDG. The Directed Acyclic Graph of Strongly Connected Components (DAGSCC) represents these cycles directly. The naive method to construct the DAGSCC first computes the full PDG. This approach limits adoption of aggressive optimizations because the number of analysis queries grows quadratically with program size, making DAGSCC construction expensive. Consequently, compilers optimize small scopes with weaker but faster analyses. We observe that many PDG edges do not affect the DAGSCC and that ignoring them cannot affect clients of the DAGSCC. Exploiting this insight, we present an algorithm to omit those analysis queries to compute the DAGSCC efficiently. 
Across 366 hot loops from 20 SPEC2006 benchmarks, this method computes the DAGSCC in half of the time using half as many queries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{ElWazeer:2013:SVD, author = "Khaled ElWazeer and Kapil Anand and Aparna Kotha and Matthew Smithson and Rajeev Barua", title = "Scalable variable and data type detection in a binary rewriter", journal = j-SIGPLAN, volume = "48", number = "6", pages = "51--60", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462165", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present scalable static analyses to recover variables, data types, and function prototypes from stripped x86 executables (without symbol or debug information) and obtain a functional intermediate representation (IR) for analysis and rewriting purposes. Our techniques on average run $ 352 \times $ faster than current techniques and still have the same precision. This enables analyzing executables as large as millions of instructions in minutes which is not possible using existing techniques. Our techniques can recover variables allocated to the floating point stack, unlike current techniques. We have integrated our techniques to obtain a compiler level IR that works correctly if recompiled and produces the same output as the input executable. We demonstrate scalability, precision and correctness of our proposed techniques by evaluating them on the complete SPEC2006 benchmarks suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Rajaram:2013:FRT, author = "Bharghava Rajaram and Vijay Nagarajan and Susmit Sarkar and Marco Elver", title = "Fast {RMWs} for {TSO}: semantics and implementation", journal = j-SIGPLAN, volume = "48", number = "6", pages = "61--72", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462196", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Read-Modify-Write (RMW) instructions are widely used as the building blocks of a variety of higher level synchronization constructs, including locks, barriers, and lock-free data structures. Unfortunately, they are expensive in architectures such as x86 and SPARC which enforce (variants of) Total-Store-Order (TSO). A key reason is that RMWs in these architectures are ordered like a memory barrier, incurring the cost of a write-buffer drain in the critical path. Such strong ordering semantics are dictated by the requirements of the strict atomicity definition (type-1) that existing TSO RMWs use. Programmers often do not need such strong semantics. Besides, weakening the atomicity definition of TSO RMWs, would also weaken their ordering --- thereby leading to more efficient hardware implementations. 
In this paper we argue for TSO RMWs to use weaker atomicity definitions --- we consider two weaker definitions: type-2 and type-3, with different relaxed ordering differences. We formally specify how such weaker RMWs would be ordered, and show that type-2 RMWs, in particular, can seamlessly replace existing type-1 RMWs in common synchronization idioms --- except in situations where a type-1 RMW is used as a memory barrier. Recent work has shown that the new C/C++11 concurrency model can be realized by generating conventional (type-1) RMWs for C/C++11 SC-atomic-writes and/or SC-atomic-reads. We formally prove that this is equally valid using the proposed type-2 RMWs; type-3 RMWs, on the other hand, could be used for SC-atomic-reads (and optionally SC-atomic-writes). We further propose efficient microarchitectural implementations for type-2 (type-3) RMWs --- simulation results show that our implementation reduces the cost of an RMW by up to 58.9\% (64.3\%), which translates into an overall performance improvement of up to 9.0\% (9.2\%) on a set of parallel programs, including those from the SPLASH-2, PARSEC, and STAMP benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Gordon:2013:RGR, author = "Colin S. Gordon and Michael D. Ernst and Dan Grossman", title = "Rely-guarantee references for refinement types over aliased mutable data", journal = j-SIGPLAN, volume = "48", number = "6", pages = "73--84", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462160", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reasoning about side effects and aliasing is the heart of verifying imperative programs. Unrestricted side effects through one reference can invalidate assumptions about an alias. We present a new type system approach to reasoning about safe assumptions in the presence of aliasing and side effects, unifying ideas from reference immutability type systems and rely-guarantee program logics. Our approach, rely-guarantee references, treats multiple references to shared objects similarly to multiple threads in rely-guarantee program logics. We propose statically associating rely and guarantee conditions with individual references to shared objects. Multiple aliases to a given object may coexist only if the guarantee condition of each alias implies the rely condition for all other aliases. We demonstrate that existing reference immutability type systems are special cases of rely-guarantee references. In addition to allowing precise control over state modification, rely-guarantee references allow types to depend on mutable data while still permitting flexible aliasing. Dependent types whose denotation is stable over the actions of the rely and guarantee conditions for a reference and its data will not be invalidated by any action through any alias. We demonstrate this with refinement (subset) types that may depend on mutable data. As a special case, we derive the first reference immutability type system with dependent types over immutable data. 
We show soundness for our approach and describe experience using rely-guarantee references in a dependently-typed monadic DSL in Coq.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Titzer:2013:HCF, author = "Ben L. Titzer", title = "Harmonizing classes, functions, tuples, and type parameters in {Virgil III}", journal = j-SIGPLAN, volume = "48", number = "6", pages = "85--94", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2491962", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Languages are becoming increasingly multi-paradigm. Subtype polymorphism in statically-typed object-oriented languages is being supplemented with parametric polymorphism in the form of generics. Features like first-class functions and lambdas are appearing everywhere. Yet existing languages like Java, C\#, C++, D, and Scala seem to accrete ever more complexity when they reach beyond their original paradigm into another; inevitably older features have some rough edges that lead to nonuniformity and pitfalls. Given a fresh start, a new language designer is faced with a daunting array of potential features. Where to start? What is important to get right first, and what can be added later? What features must work together, and what features are orthogonal? We report on our experience with Virgil III, a practical language with a careful balance of classes, functions, tuples and type parameters. Virgil intentionally lacks many advanced features, yet we find its core feature set enables new species of design patterns that bridge multiple paradigms and emulate features not directly supported such as interfaces, abstract data types, ad hoc polymorphism, and variant types. Surprisingly, we find variance for function types and tuple types often replaces the need for other kinds of type variance when libraries are designed in a more functional style.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Burckhardt:2013:ACF, author = "Sebastian Burckhardt and Manuel Fahndrich and Peli de Halleux and Sean McDirmid and Michal Moskal and Nikolai Tillmann and Jun Kato", title = "{It}'s alive! {Continuous} feedback in {UI} programming", journal = j-SIGPLAN, volume = "48", number = "6", pages = "95--104", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462170", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Live programming allows programmers to edit the code of a running program and immediately see the effect of the code changes. This tightening of the traditional edit-compile-run cycle reduces the cognitive gap between program code and execution, improving the learning experience of beginning programmers while boosting the productivity of seasoned ones. 
Unfortunately, live programming is difficult to realize in practice as imperative languages lack well-defined abstraction boundaries that make live programming responsive or its feedback comprehensible. This paper enables live programming for user interface programming by cleanly separating the rendering and non-rendering aspects of a UI program, allowing the display to be refreshed on a code change without restarting the program. A type and effect system formalizes this separation and provides an evaluation model that incorporates the code update step. By putting live programming on a more formal footing, we hope to enable critical and technical discussion of live programming systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{DeVito:2013:TMS, author = "Zachary DeVito and James Hegarty and Alex Aiken and Pat Hanrahan and Jan Vitek", title = "{Terra}: a multi-stage language for high-performance computing", journal = j-SIGPLAN, volume = "48", number = "6", pages = "105--116", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462166", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "High-performance computing applications, such as auto-tuners and domain-specific languages, rely on generative programming techniques to achieve high performance and portability. However, these systems are often implemented in multiple disparate languages and perform code generation in a separate process from program execution, making certain optimizations difficult to engineer. We leverage a popular scripting language, Lua, to stage the execution of a novel low-level language, Terra. Users can implement optimizations in the high-level language, and use built-in constructs to generate and execute high-performance Terra code. To simplify meta-programming, Lua and Terra share the same lexical environment, but, to ensure performance, Terra code can execute independently of Lua's runtime. We evaluate our design by reimplementing existing multi-language systems entirely in Terra. Our Terra-based auto-tuner for BLAS routines performs within 20\% of ATLAS, and our DSL for stencil computations runs 2.3x faster than hand-written C.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Li:2013:SIA, author = "Jiajia Li and Guangming Tan and Mingyu Chen and Ninghui Sun", title = "{SMAT}: an input adaptive auto-tuner for sparse matrix-vector multiplication", journal = j-SIGPLAN, volume = "48", number = "6", pages = "117--126", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462181", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Sparse Matrix Vector multiplication (SpMV) is an important kernel in both traditional high performance computing and emerging data-intensive applications. 
By far, SpMV libraries are optimized by either application-specific or architecture-specific approaches, making the libraries become too complicated to be used extensively in real applications. In this work we develop a Sparse Matrix-vector multiplication Auto-Tuning system (SMAT) to bridge the gap between specific optimizations and general-purpose usage. SMAT provides users with a unified programming interface in compressed sparse row (CSR) format and automatically determines the optimal format and implementation for any input sparse matrix at runtime. For this purpose, SMAT leverages a learning model, which is generated in an off-line stage by a machine learning method with a training set of more than 2000 matrices from the UF sparse matrix collection, to quickly predict the best combination of the matrix feature parameters. Our experiments show that SMAT achieves impressive performance of up to 51GFLOPS in single-precision and 37GFLOPS in double-precision on mainstream x86 multi-core processors, which are both more than 3 times faster than the Intel MKL library. We also demonstrate its adaptability in an algebraic multigrid solver from Hypre library with above 20\% performance improvement reported.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Kong:2013:WPT, author = "Martin Kong and Richard Veras and Kevin Stock and Franz Franchetti and Louis-No{\"e}l Pouchet and P. Sadayappan", title = "When polyhedral transformations meet {SIMD} code generation", journal = j-SIGPLAN, volume = "48", number = "6", pages = "127--138", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462187", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data locality and parallelism are critical optimization objectives for performance on modern multi-core machines. Both coarse-grain parallelism (e.g., multi-core) and fine-grain parallelism (e.g., vector SIMD) must be effectively exploited, but despite decades of progress at both ends, current compiler optimization schemes that attempt to address data locality and both kinds of parallelism often fail at one of the three objectives. We address this problem by proposing a 3-step framework, which aims for integrated data locality, multi-core parallelism and SIMD execution of programs. We define the concept of vectorizable codelets, with properties tailored to achieve effective SIMD code generation for the codelets. We leverage the power of a modern high-level transformation framework to restructure a program to expose good ISA-independent vectorizable codelets, exploiting multi-dimensional data reuse. Then, we generate ISA-specific customized code for the codelets, using a collection of lower-level SIMD-focused optimizations. We demonstrate our approach on a collection of numerical kernels that we automatically tile, parallelize and vectorize, exhibiting significant performance improvements over existing compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Schneider:2013:PLS, author = "Fred B. 
Schneider", title = "Programming languages in security: keynote", journal = j-SIGPLAN, volume = "48", number = "6", pages = "139--140", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462158", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Huang:2013:CRL, author = "Jeff Huang and Charles Zhang and Julian Dolby", title = "{CLAP}: recording local executions to reproduce concurrency failures", journal = j-SIGPLAN, volume = "48", number = "6", pages = "141--152", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462167", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present CLAP, a new technique to reproduce concurrency bugs. CLAP has two key steps. First, it logs thread local execution paths at runtime. Second, offline, it computes memory dependencies that accord with the logged execution and are able to reproduce the observed bug. The second step works by combining constraints from the thread paths and constraints based on a memory model, and computing an execution with a constraint solver. CLAP has four major advantages. First, logging purely local execution of each thread is substantially cheaper than logging memory interactions, which enables CLAP to be efficient compared to previous approaches. Second, our logging does not require any synchronization and hence with no added memory barriers or fences; this minimizes perturbation and missed bugs due to extra synchronizations foreclosing certain racy behaviors. Third, since it uses no synchronization, we extend CLAP to work on a range of relaxed memory models, such as TSO and PSO, in addition to sequential consistency. Fourth, CLAP can compute a much simpler execution than the original one, that reveals the bug with minimal thread context switches. To mitigate the scalability issues, we also present an approach to parallelize constraint solving, which theoretically scales our technique to programs with arbitrary execution length. 
Experimental results on a variety of multithreaded benchmarks and real world concurrent applications validate these advantages by showing that our technique is effective in reproducing concurrency bugs even under relaxed memory models; furthermore, it is significantly more efficient than a state-of-the-art technique that records shared memory dependencies, reducing execution time overhead by 45\% and log size by 88\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Elmas:2013:CDS, author = "Tayfun Elmas and Jacob Burnim and George Necula and Koushik Sen", title = "{CONCURRIT}: a domain specific language for reproducing concurrency bugs", journal = j-SIGPLAN, volume = "48", number = "6", pages = "153--164", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462162", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present CONCURRIT, a domain-specific language (DSL) for reproducing concurrency bugs. Given some partial information about the nature of a bug in an application, a programmer can write a CONCURRIT script to formally and concisely specify a set of thread schedules to explore in order to find a schedule exhibiting the bug. Further, the programmer can specify how these thread schedules should be searched to find a schedule that reproduces the bug. We implemented CONCURRIT as an embedded DSL in C++, which uses manual or automatic source instrumentation to partially control the scheduling of the software under test. Using CONCURRIT, we were able to write concise tests to reproduce concurrency bugs in a variety of benchmarks, including the Mozilla's SpiderMonkey JavaScript engine, Memcached, Apache's HTTP server, and MySQL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Schaefer:2013:DDA, author = "Max Sch{\"a}efer and Manu Sridharan and Julian Dolby and Frank Tip", title = "Dynamic determinacy analysis", journal = j-SIGPLAN, volume = "48", number = "6", pages = "165--174", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462168", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an analysis for identifying determinate variables and expressions that always have the same value at a given program point. This information can be exploited by client analyses and tools to, e.g., identify dead code or specialize uses of dynamic language constructs such as eval, replacing them with equivalent static constructs. Our analysis is completely dynamic and only needs to observe a single execution of the program, yet the determinacy facts it infers hold for any execution. We present a formal soundness proof of the analysis for a simple imperative language, and a prototype implementation that handles full JavaScript. Finally, we report on two case studies that explored how static analysis for JavaScript could leverage the information gathered by dynamic determinacy analysis. 
We found that in some cases scalability of static pointer analysis was improved dramatically, and that many uses of runtime code generation could be eliminated.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Zhao:2013:FVS, author = "Jianzhou Zhao and Santosh Nagarakatte and Milo M. K. Martin and Steve Zdancewic", title = "Formal verification of {SSA}-based optimizations for {LLVM}", journal = j-SIGPLAN, volume = "48", number = "6", pages = "175--186", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462164", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern compilers, such as LLVM and GCC, use a static single assignment (SSA) intermediate representation (IR) to simplify and enable many advanced optimizations. However, formally verifying the correctness of SSA-based optimizations is challenging because SSA properties depend on a function's entire control-flow graph. This paper addresses this challenge by developing a proof technique for proving SSA-based program invariants and compiler optimizations. We use this technique in the Coq proof assistant to create mechanized correctness proofs of several ``micro'' transformations that form the building blocks for larger SSA optimizations. To demonstrate the utility of this approach, we formally verify a variant of LLVM's mem2reg transformation in Vellvm, a Coq-based formal semantics of the LLVM IR. The extracted implementation generates code with performance comparable to that of LLVM's unverified implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Morisset:2013:CTT, author = "Robin Morisset and Pankaj Pawan and Francesco Zappa Nardelli", title = "Compiler testing via a theory of sound optimisations in the {C11\slash C++11} memory model", journal = j-SIGPLAN, volume = "48", number = "6", pages = "187--196", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2491967", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compilers sometimes generate correct sequential code but break the concurrency memory model of the programming language: these subtle compiler bugs are observable only when the miscompiled functions interact with concurrent contexts, making them particularly hard to detect. In this work we design a strategy to reduce the hard problem of hunting concurrency compiler bugs to differential testing of sequential code and build a tool that puts this strategy to work. Our first contribution is a theory of sound optimisations in the C11/C++11 memory model, covering most of the optimisations we have observed in real compilers and validating the claim that common compiler optimisations are sound in the C11/C++11 memory model. Our second contribution is to show how, building on this theory, concurrency compiler bugs can be identified by comparing the memory trace of compiled code against a reference memory trace for the source code. 
Our tool identified several mistaken write introductions and other unexpected behaviours in the latest release of the gcc compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Chen:2013:TCF, author = "Yang Chen and Alex Groce and Chaoqiang Zhang and Weng-Keen Wong and Xiaoli Fern and Eric Eide and John Regehr", title = "Taming compiler fuzzers", journal = j-SIGPLAN, volume = "48", number = "6", pages = "197--208", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462173", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Aggressive random testing tools (``fuzzers'') are impressively effective at finding compiler bugs. For example, a single test-case generator has resulted in more than 1,700 bugs reported for a single JavaScript engine. However, fuzzers can be frustrating to use: they indiscriminately and repeatedly find bugs that may not be severe enough to fix right away. Currently, users filter out undesirable test cases using ad hoc methods such as disallowing problematic features in tests and grepping test results. This paper formulates and addresses the fuzzer taming problem: given a potentially large number of random test cases that trigger failures, order them such that diverse, interesting test cases are highly ranked. Our evaluation shows our ability to solve the fuzzer taming problem for 3,799 test cases triggering 46 bugs in a C compiler and 2,603 test cases triggering 28 bugs in a JavaScript engine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Blackshear:2013:ACS, author = "Sam Blackshear and Shuvendu K. Lahiri", title = "Almost-correct specifications: a modular semantic framework for assigning confidence to warnings", journal = j-SIGPLAN, volume = "48", number = "6", pages = "209--218", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462188", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modular assertion checkers are plagued with false alarms due to the need for precise environment specifications (preconditions and callee postconditions). Even the fully precise checkers report assertion failures under the most demonic environments allowed by unconstrained or partial specifications. The inability to preclude overly adversarial environments makes such checkers less attractive to developers and severely limits the adoption of such tools in the development cycle. In this work, we propose a parameterized framework for prioritizing the assertion failures reported by a modular verifier, with the goal of suppressing warnings from overly demonic environments. We formalize almost-correct specifications as the minimal weakening of an angelic specification (over a set of predicates) that precludes any dead code intraprocedurally. Our work is inspired by and generalizes some aspects of semantic inconsistency detection. Our formulation allows us to lift this idea to a general class of warnings.
We have developed a prototype {\tt acspec}, which we use to explore a few instantiations of the framework and report preliminary findings on a diverse set of C benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Cook:2013:RAN, author = "Byron Cook and Eric Koskinen", title = "Reasoning about nondeterminism in programs", journal = j-SIGPLAN, volume = "48", number = "6", pages = "219--230", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2491969", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Branching-time temporal logics (e.g. CTL, CTL*, modal mu-calculus) allow us to ask sophisticated questions about the nondeterminism that appears in systems. Applications of this type of reasoning include planning, games, security analysis, disproving, precondition synthesis, environment synthesis, etc. Unfortunately, existing automatic branching-time verification tools have limitations that have traditionally restricted their applicability (e.g. push-down systems only, universal path quantifiers only, etc). In this paper we introduce an automation strategy that lifts many of these previous restrictions. Our method works reliably for properties with non-trivial mixtures of universal and existential modal operators. Furthermore, our approach is designed to support (possibly infinite-state) programs. The basis of our approach is the observation that existential reasoning can be reduced to universal reasoning if the system's state-space is appropriately restricted. This restriction on the state-space must meet a constraint derived from recent work on proving non-termination. The observation leads to a new route for implementation based on existing tools. To demonstrate the practical viability of our approach, we report on the results applying our preliminary implementation to a set of benchmarks drawn from the Windows operating system, the PostgreSQL database server, SoftUpdates patching system, as well as other hand-crafted examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Qiu:2013:NPS, author = "Xiaokang Qiu and Pranav Garg and Andrei Stefanescu and Parthasarathy Madhusudan", title = "Natural proofs for structure, data, and separation", journal = j-SIGPLAN, volume = "48", number = "6", pages = "231--242", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462169", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose natural proofs for reasoning with programs that manipulate data-structures against specifications that describe the structure of the heap, the data stored within it, and separation and framing of sub-structures. Natural proofs are a subclass of proofs that are amenable to completely automated reasoning, that provide sound but incomplete procedures, and that capture common reasoning tactics in program verification. 
We develop a dialect of separation logic over heaps, called Dryad, with recursive definitions that avoids explicit quantification. We develop ways to reason with heaplets using classical logic over the theory of sets, and develop natural proofs for reasoning using proof tactics involving disciplined unfoldings and formula abstractions. Natural proofs are encoded into decidable theories of first-order logic so as to be discharged using SMT solvers. We also implement the technique and show that a large class of more than 100 correct programs that manipulate data-structures are amenable to full functional correctness using the proposed natural proof method. These programs are drawn from a variety of sources including standard data-structures, the Schorr--Waite algorithm for garbage collection, a large number of low-level C routines from the Glib library and OpenBSD library, the Linux kernel, and routines from a secure verified OS-browser project. Our work is the first that we know of that can handle such a wide range of full functional verification properties of heaps automatically, given pre/post and loop invariant annotations. We believe that this work paves the way for deductive verification technology to be used by programmers who do not (and need not) understand the internals of the underlying logic solvers, significantly increasing their applicability in building reliable systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Yu:2013:GDS, author = "Hongtao Yu and Hou-Jen Ko and Zhiyuan Li", title = "General data structure expansion for multi-threading", journal = j-SIGPLAN, volume = "48", number = "6", pages = "243--252", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462182", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Among techniques for parallelizing sequential codes, privatization is a common and significant transformation performed by both compilers and runtime parallelizing systems. Without privatization, repetitive updates to the same data structures often introduce spurious data dependencies that hide the inherent parallelism. Unfortunately, it remains a significant challenge to compilers to automatically privatize dynamic and recursive data structures which appear frequently in real applications written in languages such as C/C++. This is because such languages lack a naming mechanism to define the address range of a pointer-based data structure, in contrast to arrays with explicitly declared bounds. In this paper we present a novel solution to this difficult problem by expanding general data structures such that memory accesses issued from different threads to contentious data structures are directed to different data fields. Based on compile-time type checking and a data dependence graph, this aggressive extension to the traditional scalar and array expansion isolates the address ranges among different threads, without struggling with privatization based on thread-private stacks, such that the targeted loop can be effectively parallelized. With this method fully implemented in GCC, experiments are conducted on a set of programs from well-known benchmark suites such as Mibench, MediaBench II and SPECint. 
Results show that the new approach can lead to a high speedup when executing the transformed code on multiple cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Hung:2013:AAS, author = "Wei-Lun Hung and Vijay K. Garg", title = "{AutoSynch}: an automatic-signal monitor based on predicate tagging", journal = j-SIGPLAN, volume = "48", number = "6", pages = "253--262", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462175", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "{Most programming languages use monitors with explicit signals for synchronization in shared-memory programs. Requiring programmers to signal threads explicitly results in many concurrency bugs due to missed notifications, or notifications on wrong condition variables. In this paper, we describe an implementation of an automatic signaling monitor in Java called AutoSynch that eliminates such concurrency bugs by removing the burden of signaling from the programmer. We show that the belief that automatic signaling monitors are prohibitively expensive is wrong. For most problems, programs based on AutoSynch are almost as fast as those based on explicit signaling. For some, AutoSynch is even faster than explicit signaling because it never uses signalAll, whereas the programmers end up using signalAll with the explicit signal mechanism. AutoSynch} achieves efficiency in synchronization based on three novel ideas. We introduce an operation called closure that enables the predicate evaluation in every thread, thereby reducing context switches during the execution of the program. Secondly, AutoSynch avoids signalAll by using a property called relay invariance that guarantees that whenever possible there is always at least one thread whose condition is true which has been signaled. Finally, AutoSynch uses a technique called predicate tagging to efficiently determine a thread that should be signaled. To evaluate the efficiency of AutoSynch, we have implemented many different well-known synchronization problems such as the producers/consumers problem, the readers/writers problems, and the dining philosophers problem. The results show that AutoSynch is almost as efficient as the explicit-signal monitor and even more efficient for some cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Golan-Gueta:2013:CLF, author = "Guy Golan-Gueta and G. Ramalingam and Mooly Sagiv and Eran Yahav", title = "Concurrent libraries with foresight", journal = j-SIGPLAN, volume = "48", number = "6", pages = "263--274", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462172", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Linearizable libraries provide operations that appear to execute atomically. Clients, however, may need to execute a sequence of operations (a composite operation) atomically. 
We consider the problem of extending a linearizable library to support arbitrary atomic composite operations by clients. We introduce a novel approach in which the concurrent library ensures atomicity of composite operations by exploiting information (foresight) provided by its clients. We use a correctness condition, based on a notion of dynamic right-movers, that guarantees that composite operations execute atomically without deadlocks, and without using rollbacks. We present a static analysis to infer the foresight information required by our approach, allowing a compiler to automatically insert the foresight information into the client. This relieves the client programmer of this burden and simplifies writing client code. We present a generic technique for extending the library implementation to realize foresight-based synchronization. This technique is used to implement a general-purpose Java library for Map data structures --- the library permits composite operations to simultaneously work with multiple instances of Map data structures. We use the Maps library and the static analysis to enforce atomicity of a wide selection of real-life Java composite operations. Our experiments indicate that our approach enables realizing efficient and scalable synchronization for real-life composite operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Blackshear:2013:TPR, author = "Sam Blackshear and Bor-Yuh Evan Chang and Manu Sridharan", title = "{Thresher}: precise refutations for heap reachability", journal = j-SIGPLAN, volume = "48", number = "6", pages = "275--286", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462186", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a precise, path-sensitive static analysis for reasoning about heap reachability, that is, whether an object can be reached from another variable or object via pointer dereferences. Precise reachability information is useful for a number of clients, including static detection of a class of Android memory leaks. For this client, we found the heap reachability information computed by a state-of-the-art points-to analysis was too imprecise, leading to numerous false-positive leak reports. Our analysis combines a symbolic execution capable of path-sensitivity and strong updates with abstract heap information computed by an initial flow-insensitive points-to analysis. This novel mixed representation allows us to achieve both precision and scalability by leveraging the pre-computed points-to facts to guide execution and prune infeasible paths. We have evaluated our techniques in the Thresher tool, which we used to find several developer-confirmed leaks in Android applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Udupa:2013:TSP, author = "Abhishek Udupa and Arun Raghavan and Jyotirmoy V. Deshmukh and Sela Mador-Haim and Milo M. K. 
Martin and Rajeev Alur", title = "{TRANSIT}: specifying protocols with concolic snippets", journal = j-SIGPLAN, volume = "48", number = "6", pages = "287--296", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462174", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the maturing of technology for model checking and constraint solving, there is an emerging opportunity to develop programming tools that can transform the way systems are specified. In this paper, we propose a new way to program distributed protocols using concolic snippets. Concolic snippets are sample execution fragments that contain both concrete and symbolic values. The proposed approach allows the programmer to describe the desired system partially using the traditional model of communicating extended finite-state-machines (EFSM), along with high-level invariants and concrete execution fragments. Our synthesis engine completes an EFSM skeleton by inferring guards and updates from the given fragments which is then automatically analyzed using a model checker with respect to the desired invariants. The counterexamples produced by the model checker can then be used by the programmer to add new concrete execution fragments that describe the correct behavior in the specific scenario corresponding to the counterexample. We describe TRANSIT, a language and prototype implementation of the proposed specification methodology for distributed protocols. Experimental evaluations of TRANSIT to specify cache coherence protocols show that (1) the algorithm for expression inference from concolic snippets can synthesize expressions of size 15 involving typical operators over commonly occurring types, (2) for a classical directory-based protocol, TRANSIT automatically generates, in a few seconds, a complete implementation from a specification consisting of the EFSM structure and a few concrete examples for every transition, and (3) a published partial description of the SGI Origin cache coherence protocol maps directly to symbolic examples and leads to a complete implementation in a few iterations, with the programmer correcting counterexamples resulting from underspecified transitions by adding concrete examples in each iteration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Gao:2013:UMR, author = "Tiejun Gao and Karin Strauss and Stephen M. Blackburn and Kathryn S. McKinley and Doug Burger and James Larus", title = "Using managed runtime systems to tolerate holes in wearable memories", journal = j-SIGPLAN, volume = "48", number = "6", pages = "297--308", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462171", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "{New memory technologies, such as phase-change memory (PCM), promise denser and cheaper main memory, and are expected to displace DRAM. However, many of them experience permanent failures far more quickly than DRAM. 
DRAM mechanisms that handle permanent failures rely on very low failure rates and, if directly applied to PCM, are extremely inefficient: Discarding a page when the first line fails wastes 98\% of the memory. This paper proposes low complexity cooperative software and hardware that handle failure rates as high as 50\%. Our approach makes error handling transparent to the application by using the memory abstraction offered by managed languages. Once hardware error correction for a memory line is exhausted, rather than discarding the entire page, the hardware communicates the failed line to a failure-aware OS and runtime. The runtime ensures memory allocations never use failed lines and moves data when lines fail during program execution. This paper describes minimal extensions to an Immix mark-region garbage collector, which correctly utilizes pages with failed physical lines by skipping over failures. This paper also proposes hardware support that clusters failed lines at one end of a memory region to reduce fragmentation and improve performance under failures. Contrary to accepted hardware wisdom that advocates for wear-leveling, we show that with software support non-uniform failures delay the impact of memory failure. Together, these mechanisms incur no performance overhead when there are no failures and at failure levels of 10\% to 50\% suffer only an average overhead of 4\% and 12\%}, respectively. These results indicate that hardware and software cooperation can greatly extend the life of wearable memories.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Cohen:2013:LPC, author = "Nachshon Cohen and Erez Petrank", title = "Limitations of partial compaction: towards practical bounds", journal = j-SIGPLAN, volume = "48", number = "6", pages = "309--320", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2491973", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compaction of a managed heap is considered a costly operation, and is avoided as much as possible in commercial runtimes. Instead, partial compaction is often used to defragment parts of the heap and avoid space blow up. Previous study of compaction limitation provided some initial asymptotic bounds but no implications for practical systems. 
In this work, we extend the theory to obtain better bounds and make them strong enough to become meaningful for modern systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Desai:2013:PSA, author = "Ankush Desai and Vivek Gupta and Ethan Jackson and Shaz Qadeer and Sriram Rajamani and Damien Zufferey", title = "{P}: safe asynchronous event-driven programming", journal = j-SIGPLAN, volume = "48", number = "6", pages = "321--332", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462184", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe the design and implementation of P, a domain-specific language to write asynchronous event driven code. P allows the programmer to specify the system as a collection of interacting state machines, which communicate with each other using events. P unifies modeling and programming into one activity for the programmer. Not only can a P program be compiled into executable code, but it can also be tested using model checking techniques. P allows the programmer to specify the environment, used to ``close'' the system during testing, as nondeterministic ghost machines. Ghost machines are erased during compilation to executable code; a type system ensures that the erasure is semantics preserving. The P language is designed so that a P program can be checked for responsiveness---the ability to handle every event in a timely manner. By default, a machine needs to handle every event that arrives in every state. But handling every event in every state is impractical. The language provides a notion of deferred events where the programmer can annotate when she wants to delay processing an event. The default safety checker looks for presence of unhandled events. The language also provides default liveness checks that an event cannot be potentially deferred forever. P was used to implement and verify the core of the USB device driver stack that ships with Microsoft Windows 8. The resulting driver is more reliable and performs better than its prior incarnation (which did not use P); we have more confidence in the robustness of its design due to the language abstractions and verification provided by P.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Green:2013:QSQ, author = "Alexander S. Green and Peter LeFanu Lumsdaine and Neil J. Ross and Peter Selinger and Beno{\^\i}t Valiron", title = "{Quipper}: a scalable quantum programming language", journal = j-SIGPLAN, volume = "48", number = "6", pages = "333--342", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462177", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The field of quantum algorithms is vibrant. Still, there is currently a lack of programming languages for describing quantum computation on a practical scale, i.e., not just at the level of toy problems. 
We address this issue by introducing Quipper, a scalable, expressive, functional, higher-order quantum programming language. Quipper has been used to program a diverse set of non-trivial quantum algorithms, and can generate quantum gate representations using trillions of gates. It is geared towards a model of computation that uses a classical computer to control a quantum device, but is not dependent on any particular model of quantum hardware. Quipper has proven effective and easy to use, and opens the door towards using formal methods to analyze quantum algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Isradisaikul:2013:REP, author = "Chinawat Isradisaikul and Andrew C. Myers", title = "Reconciling exhaustive pattern matching with objects", journal = j-SIGPLAN, volume = "48", number = "6", pages = "343--354", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462194", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Pattern matching, an important feature of functional languages, is in conflict with data abstraction and extensibility, which are central to object-oriented languages. Modal abstraction offers an integration of deep pattern matching and convenient iteration abstractions into an object-oriented setting; however, because of data abstraction, it is challenging for a compiler to statically verify properties such as exhaustiveness. In this work, we extend modal abstraction in the JMatch language to support static, modular reasoning about exhaustiveness and redundancy. New matching specifications allow these properties to be checked using an SMT solver. We also introduce expressive pattern-matching constructs. Our evaluation shows that these new features enable more concise code and that the performance of checking exhaustiveness and redundancy is acceptable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Bodden:2013:SLS, author = "Eric Bodden and T{\'a}rsis Tol{\^e}do and M{\'a}rcio Ribeiro and Claus Brabrand and Paulo Borba and Mira Mezini", title = "{SPL LIFT}: statically analyzing software product lines in minutes instead of years", journal = j-SIGPLAN, volume = "48", number = "6", pages = "355--364", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2491976", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A software product line (SPL) encodes a potentially large variety of software products as variants of some common code base. Up until now, re-using traditional static analyses for SPLs was virtually intractable, as it required programmers to generate and analyze all products individually. In this work, however, we show how an important class of existing inter-procedural static analyses can be transparently lifted to SPLs. 
Without requiring programmers to change a single line of code, our approach SPLLIFT automatically converts any analysis formulated for traditional programs within the popular IFDS framework for inter-procedural, finite, distributive, subset problems to an SPL-aware analysis formulated in the IDE framework, a well-known extension to IFDS. Using a full implementation based on Heros, Soot, CIDE and JavaBDD, we show that with SPLLIFT one can reuse IFDS-based analyses without changing a single line of code. Through experiments using three static analyses applied to four Java-based product lines, we were able to show that our approach produces correct results and outperforms the traditional approach by several orders of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Zhang:2013:FOA, author = "Xin Zhang and Mayur Naik and Hongseok Yang", title = "Finding optimum abstractions in parametric dataflow analysis", journal = j-SIGPLAN, volume = "48", number = "6", pages = "365--376", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462185", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a technique to efficiently search a large family of abstractions in order to prove a query using a parametric dataflow analysis. Our technique either finds the cheapest such abstraction or shows that none exists. It is based on counterexample-guided abstraction refinement but applies a novel meta-analysis on abstract counterexample traces to efficiently find abstractions that are incapable of proving the query. We formalize the technique in a generic framework and apply it to two analyses: a type-state analysis and a thread-escape analysis. We demonstrate the effectiveness of the technique on a suite of Java benchmark programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Apinis:2013:HCW, author = "Kalmer Apinis and Helmut Seidl and Vesal Vojdani", title = "How to combine widening and narrowing for non-monotonic systems of equations", journal = j-SIGPLAN, volume = "48", number = "6", pages = "377--386", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462190", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Non-trivial analysis problems require complete lattices with infinite ascending and descending chains. In order to compute reasonably precise post-fixpoints of the resulting systems of equations, Cousot and Cousot have suggested accelerated fixpoint iteration by means of widening and narrowing. The strict separation into phases, however, may unnecessarily give up precision that cannot be recovered later. While widening is also applicable if equations are non-monotonic, this is no longer the case for narrowing. A narrowing iteration to improve a given post-fixpoint, additionally, must assume that all right-hand sides are monotonic. The latter assumption, though, is not met in presence of widening. 
It is also not met by equation systems corresponding to context-sensitive interprocedural analysis, possibly combining context-sensitive analysis of local information with flow-insensitive analysis of globals. As a remedy, we present a novel operator that combines a given widening operator with a given narrowing operator. We present adapted versions of round-robin as well as of worklist iteration, local, and side-effecting solving algorithms for the combined operator and prove that the resulting solvers always return sound results and are guaranteed to terminate for monotonic systems whenever only finitely many unknowns (constraint variables) are encountered.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Swamy:2013:VHO, author = "Nikhil Swamy and Joel Weinberger and Cole Schlesinger and Juan Chen and Benjamin Livshits", title = "Verifying higher-order programs with the {Dijkstra} monad", journal = j-SIGPLAN, volume = "48", number = "6", pages = "387--398", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2491978", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern programming languages, ranging from Haskell and ML, to JavaScript, C\# and Java, all make extensive use of higher-order state. This paper advocates a new verification methodology for higher-order stateful programs, based on a new monad of predicate transformers called the Dijkstra monad. Using the Dijkstra monad has a number of benefits. First, the monad naturally yields a weakest pre-condition calculus. Second, the computed specifications are structurally simpler in several ways, e.g., single-state post-conditions are sufficient (rather than the more complex two-state post-conditions). Finally, the monad can easily be varied to handle features like exceptions and heap invariants, while retaining the same type inference algorithm. We implement the Dijkstra monad and its type inference algorithm for the F* programming language. Our most extensive case study evaluates the Dijkstra monad and its F* implementation by using it to verify JavaScript programs. Specifically, we describe a tool chain that translates programs in a subset of JavaScript decorated with assertions and loop invariants to F*. Once in F*, our type inference algorithm computes verification conditions and automatically discharges their proofs using an SMT solver. 
We use our tools to prove that a core model of the JavaScript runtime in F* respects various invariants and that a suite of JavaScript source programs are free of runtime errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Sergey:2013:MAI, author = "Ilya Sergey and Dominique Devriese and Matthew Might and Jan Midtgaard and David Darais and Dave Clarke and Frank Piessens", title = "Monadic abstract interpreters", journal = j-SIGPLAN, volume = "48", number = "6", pages = "399--410", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2491979", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent developments in the systematic construction of abstract interpreters hinted at the possibility of a broad unification of concepts in static analysis. We deliver that unification by showing context-sensitivity, polyvariance, flow-sensitivity, reachability-pruning, heap-cloning and cardinality-bounding to be independent of any particular semantics. Monads become the unifying agent between these concepts and between semantics. For instance, by plugging the same ``context-insensitivity monad'' into a monadically-parameterized semantics for Java or for the lambda calculus, it yields the expected context-insensitive analysis. To achieve this unification, we develop a systematic method for transforming a concrete semantics into a monadically-parameterized abstract machine. Changing the monad changes the behavior of the machine. By changing the monad, we recover a spectrum of machines---from the original concrete semantics to a monovariant, flow- and context-insensitive static analysis with a singly-threaded heap and weak updates. The monadic parameterization also suggests an abstraction over the ubiquitous monotone fixed-point computation found in static analysis. This abstraction makes it straightforward to instrument an analysis with high-level strategies for improving precision and performance, such as abstract garbage collection and widening. While the paper itself runs the development for continuation-passing style, our generic implementation replays it for direct-style lambda-calculus and Featherweight Java to support generality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Czaplicki:2013:AFR, author = "Evan Czaplicki and Stephen Chong", title = "Asynchronous functional reactive programming for {GUIs}", journal = j-SIGPLAN, volume = "48", number = "6", pages = "411--422", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462161", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graphical user interfaces (GUIs) mediate many of our interactions with computers. Functional Reactive Programming (FRP) is a promising approach to GUI design, providing high-level, declarative, compositional abstractions to describe user interactions and time-dependent computations. 
We present Elm, a practical FRP language focused on easy creation of responsive GUIs. Elm has two major features: simple declarative support for Asynchronous FRP; and purely functional graphical layout. Asynchronous FRP allows the programmer to specify when the global ordering of event processing can be violated, and thus enables efficient concurrent execution of FRP programs; long-running computation can be executed asynchronously and not adversely affect the responsiveness of the user interface. Layout in Elm is achieved using a purely functional declarative framework that makes it simple to create and combine text, images, and video into rich multimedia displays. Together, Elm's two major features simplify the complicated task of creating responsive and usable GUIs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Kastrinis:2013:HCS, author = "George Kastrinis and Yannis Smaragdakis", title = "Hybrid context-sensitivity for points-to analysis", journal = j-SIGPLAN, volume = "48", number = "6", pages = "423--434", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462191", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Context-sensitive points-to analysis is valuable for achieving high precision with good performance. The standard flavors of context-sensitivity are call-site-sensitivity (kCFA) and object-sensitivity. Combining both flavors of context-sensitivity increases precision but at an infeasibly high cost. We show that a selective combination of call-site- and object-sensitivity for Java points-to analysis is highly profitable. Namely, by keeping a combined context only when analyzing selected language features, we can closely approximate the precision of an analysis that keeps both contexts at all times. In terms of speed, the selective combination of both kinds of context not only vastly outperforms non-selective combinations but is also faster than a mere object-sensitive analysis. This result holds for a large array of analyses (e.g., 1-object-sensitive, 2-object-sensitive with a context-sensitive heap, type-sensitive) establishing a new set of performance/precision sweet spots.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Zhang:2013:FAD, author = "Qirun Zhang and Michael R. Lyu and Hao Yuan and Zhendong Su", title = "Fast algorithms for {Dyck--CFL}-reachability with applications to alias analysis", journal = j-SIGPLAN, volume = "48", number = "6", pages = "435--446", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462159", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The context-free language (CFL) reachability problem is a well-known fundamental formulation in program analysis. In practice, many program analyses, especially pointer analyses, adopt a restricted version of CFL-reachability, Dyck-CFL-reachability, and compute on edge-labeled bidirected graphs. 
Solving the all-pairs Dyck-CFL-reachability on such bidirected graphs is expensive. For a bidirected graph with n nodes and m edges, the traditional dynamic programming style algorithm exhibits a subcubic time complexity for the Dyck language with k kinds of parentheses. When the underlying graphs are restricted to bidirected trees, an algorithm with O(n log n log k) time complexity was proposed recently. This paper studies the Dyck-CFL-reachability problems on bidirected trees and graphs. In particular, it presents two fast algorithms with O(n) and O(n + m log m) time complexities on trees and graphs respectively. We have implemented and evaluated our algorithms on a state-of-the-art alias analysis for Java. Results on standard benchmarks show that our algorithms achieve orders of magnitude speedup and consume less memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Sankaranarayanan:2013:SAP, author = "Sriram Sankaranarayanan and Aleksandar Chakarov and Sumit Gulwani", title = "Static analysis for probabilistic programs: inferring whole program properties from finitely many paths", journal = j-SIGPLAN, volume = "48", number = "6", pages = "447--458", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462179", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose an approach for the static analysis of probabilistic programs that sense, manipulate, and control based on uncertain data. Examples include programs used in risk analysis, medical decision making and cyber-physical systems. Correctness properties of such programs take the form of queries that seek the probabilities of assertions over program variables. We present a static analysis approach that provides guaranteed interval bounds on the values (assertion probabilities) of such queries. First, we observe that for probabilistic programs, it is possible to conclude facts about the behavior of the entire program by choosing a finite, adequate set of its paths. We provide strategies for choosing such a set of paths and verifying its adequacy. The queries are evaluated over each path by a combination of symbolic execution and probabilistic volume-bound computations. Each path yields interval bounds that can be summed up with a ``coverage'' bound to yield an interval that encloses the probability of assertion for the program as a whole. 
We demonstrate promising results on a suite of benchmarks from many different sources including robotic manipulators and medical decision making programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Liang:2013:MVL, author = "Hongjin Liang and Xinyu Feng", title = "Modular verification of linearizability with non-fixed linearization points", journal = j-SIGPLAN, volume = "48", number = "6", pages = "459--470", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462189", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Locating linearization points (LPs) is an intuitive approach for proving linearizability, but it is difficult to apply the idea in Hoare-style logic for formal program verification, especially for verifying algorithms whose LPs cannot be statically located in the code. In this paper, we propose a program logic with a lightweight instrumentation mechanism which can verify algorithms with non-fixed LPs, including the most challenging ones that use the helping mechanism to achieve lock-freedom (as in HSY elimination-based stack), or have LPs depending on unpredictable future executions (as in the lazy set algorithm), or involve both features. We also develop a thread-local simulation as the meta-theory of our logic, and show it implies contextual refinement, which is equivalent to linearizability. Using our logic we have successfully verified various classic algorithms, some of which are used in the java.util.concurrent package.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Sewell:2013:TVV, author = "Thomas Arthur Leck Sewell and Magnus O. Myreen and Gerwin Klein", title = "Translation validation for a verified {OS} kernel", journal = j-SIGPLAN, volume = "48", number = "6", pages = "471--482", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462183", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We extend the existing formal verification of the seL4 operating system microkernel from 9500 lines of C source code to the binary level. We handle all functions that were part of the previous verification. Like the original verification, we currently omit the assembly routines and volatile accesses used to control system hardware. More generally, we present an approach for proving refinement between the formal semantics of a program on the C source level and its formal semantics on the binary level, thus checking the validity of compilation, including some optimisations, and linking, and extending static properties proved of the source code to the executable. We make use of recent improvements in SMT solvers to almost fully automate this process. 
We handle binaries generated by unmodified gcc 4.5.1 at optimisation level 1, and can handle most of seL4 even at optimisation level 2.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Guha:2013:MVN, author = "Arjun Guha and Mark Reitblatt and Nate Foster", title = "Machine-verified network controllers", journal = j-SIGPLAN, volume = "48", number = "6", pages = "483--494", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462178", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In many areas of computing, techniques ranging from testing to formal modeling to full-blown verification have been successfully used to help programmers build reliable systems. But although networks are critical infrastructure, they have largely resisted analysis using formal techniques. Software-defined networking (SDN) is a new network architecture that has the potential to provide a foundation for network reasoning, by standardizing the interfaces used to express network programs and giving them a precise semantics. This paper describes the design and implementation of the first machine-verified SDN controller. Starting from the foundations, we develop a detailed operational model for OpenFlow (the most popular SDN platform) and formalize it in the Coq proof assistant. We then use this model to develop a verified compiler and run-time system for a high-level network programming language. We identify bugs in existing languages and tools built without formal foundations, and prove that these bugs are absent from our system. Finally, we describe our prototype implementation and our experiences using it to build practical applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Nowatzki:2013:GCC, author = "Tony Nowatzki and Michael Sartin-Tarm and Lorenzo {De Carli} and Karthikeyan Sankaralingam and Cristian Estan and Behnam Robatmili", title = "A general constraint-centric scheduling framework for spatial architectures", journal = j-SIGPLAN, volume = "48", number = "6", pages = "495--506", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462163", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Specialized execution using spatial architectures provides energy efficient computation, but requires effective algorithms for spatially scheduling the computation. Generally, this has been solved with architecture-specific heuristics, an approach which suffers from poor compiler/architect productivity, lack of insight on optimality, and inhibits migration of techniques between architectures. Our goal is to develop a scheduling framework usable for all spatial architectures. To this end, we express spatial scheduling as a constraint satisfaction problem using Integer Linear Programming (ILP). 
We observe that architecture primitives and scheduler responsibilities can be related through five abstractions: placement of computation, routing of data, managing event timing, managing resource utilization, and forming the optimization objectives. We encode these responsibilities as 20 general ILP constraints, which are used to create schedulers for the disparate TRIPS, DySER, and PLUG architectures. Our results show that a general declarative approach using ILP is implementable, practical, and typically matches or outperforms specialized schedulers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Lifflander:2013:STL, author = "Jonathan Lifflander and Sriram Krishnamoorthy and Laxmikant V. Kale", title = "{Steal Tree}: low-overhead tracing of work stealing schedulers", journal = j-SIGPLAN, volume = "48", number = "6", pages = "507--518", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462193", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Work stealing is a popular approach to scheduling task-parallel programs. The flexibility inherent in work stealing when dealing with load imbalance results in seemingly irregular computation structures, complicating the study of its runtime behavior. In this paper, we present an approach to efficiently trace async-finish parallel programs scheduled using work stealing. We identify key properties that allow us to trace the execution of tasks with low time and space overheads. We also study the usefulness of the proposed schemes in supporting algorithms for data-race detection and retentive stealing presented in the literature. We demonstrate that the perturbation due to tracing is within the variation in the execution time with 99\% confidence and the traces are concise, amounting to a few tens of kilobytes per thread in most cases. We also demonstrate that the traces enable significant reductions in the cost of detecting data races and result in low, stable space overheads in supporting retentive stealing for async-finish programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Ragan-Kelley:2013:HLC, author = "Jonathan Ragan-Kelley and Connelly Barnes and Andrew Adams and Sylvain Paris and Fr{\'e}do Durand and Saman Amarasinghe", title = "{Halide}: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines", journal = j-SIGPLAN, volume = "48", number = "6", pages = "519--530", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462176", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Image processing pipelines combine the challenges of stencil computations and stream programs. They are composed of large graphs of different stencil stages, as well as complex reductions, and stages with global or data-dependent access patterns. 
Because of their complex structure, the performance difference between a naive implementation of a pipeline and an optimized one is often an order of magnitude. Efficient implementations require optimization of both parallelism and locality, but due to the nature of stencils, there is a fundamental tension between parallelism, locality, and introducing redundant recomputation of shared values. We present a systematic model of the tradeoff space fundamental to stencil pipelines, a schedule representation which describes concrete points in this space for each stage in an image processing pipeline, and an optimizing compiler for the Halide image processing language that synthesizes high performance implementations from a Halide algorithm and a schedule. Combining this compiler with stochastic search over the space of schedules enables terse, composable programs to achieve state-of-the-art performance on a wide range of real image processing pipelines, and across different hardware architectures, including multicores with SIMD, and heterogeneous CPU+GPU execution. From simple Halide programs written in a few hours, we demonstrate performance up to 5x faster than hand-tuned C, intrinsics, and CUDA implementations optimized by experts over weeks or months, for image processing applications beyond the reach of past automatic compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '13 conference proceedings.", } @Article{Jia:2013:SID, author = "Ning Jia and Chun Yang and Jing Wang and Dong Tong and Keyi Wang", title = "{SPIRE}: improving dynamic binary translation through {SPC}-indexed indirect branch redirecting", journal = j-SIGPLAN, volume = "48", number = "7", pages = "1--12", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451516", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Dynamic binary translation systems must perform an address translation for every execution of indirect branch instructions. The procedure to convert a Source binary Program Counter (SPC) address to a Translated Program Counter (TPC) address always takes more than 10 instructions, becoming a major source of performance overhead. This paper proposes a novel mechanism called SPc-Indexed REdirecting (SPIRE), which can significantly reduce the indirect branch handling overhead. SPIRE doesn't rely on hash lookup and address mapping table to perform address translation. It reuses the source binary code space to build an SPC-indexed redirecting table. This table can be indexed directly by the SPC address without hashing. With SPIRE, the indirect branch can jump to the original SPC address without address translation. The trampoline residing in the SPC address will redirect the control flow to the related code cache. Only 2-6 instructions are needed to handle an indirect branch execution. As part of the source binary would be overwritten, a shadow page mechanism is explored to keep the transparency of the corrupted source binary code page. Online profiling is adopted to reduce the memory overhead. We have implemented SPIRE on an x86 to x86 DBT system, and discussed the implementation issues on different guest and host architectures. 
The experiments show that, compared with hash lookup mechanism, SPIRE can reduce the performance overhead by 36.2\% on average, up to 51.4\%, while only 5.6\% extra memory is needed. SPIRE can cooperate with other indirect branch handling mechanisms easily, and we believe the idea of SPIRE can also be applied on other occasions that need address translation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{vonKoch:2013:LRB, author = "Tobias J. K. Edler von Koch and Bj{\"o}rn Franke", title = "Limits of region-based dynamic binary parallelization", journal = j-SIGPLAN, volume = "48", number = "7", pages = "13--22", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Efficiently executing sequential legacy binaries on chip multi-processors (CMPs) composed of many, small cores is one of today's most pressing problems. Single-threaded execution is a suboptimal option due to CMPs' lower single-core performance, while multi-threaded execution relies on prior parallelization, which is severely hampered by the low-level binary representation of applications compiled and optimized for a single-core target. A recent technology to address this problem is Dynamic Binary Parallelization (DBP), which creates a Virtual Execution Environment (VEE) taking advantage of the underlying multicore host to transparently parallelize the sequential binary executable. While still in its infancy, DBP has received broad interest within the research community. The combined use of DBP and thread-level speculation (TLS) has been proposed as a technique to accelerate legacy uniprocessor code on modern CMPs. In this paper, we investigate the limits of DBP and seek to gain an understanding of the factors contributing to these limits and the costs and overheads of its implementation. We have performed an extensive evaluation using a parameterizable DBP system targeting a CMP with light-weight architectural TLS support. We demonstrate that there is room for a significant reduction of up to 54\% in the number of instructions on the critical paths of legacy SPEC CPU2006 benchmarks. 
However, we show that it is much harder to translate these savings into actual performance improvements, with a realistic hardware-supported implementation achieving a speedup of 1.09 on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hsu:2013:IDB, author = "Chun-Chen Hsu and Pangfeng Liu and Jan-Jan Wu and Pen-Chung Yew and Ding-Yong Hong and Wei-Chung Hsu and Chien-Min Wang", title = "Improving dynamic binary optimization through early-exit guided code region formation", journal = j-SIGPLAN, volume = "48", number = "7", pages = "23--32", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451519", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Most dynamic binary translators (DBT) and optimizers (DBO) target binary traces, i.e. frequently executed paths, as code regions to be translated and optimized. Code region formation is the most important first step in all DBTs and DBOs. The quality of the dynamically formed code regions determines the extent and the types of optimization opportunities that can be exposed to DBTs and DBOs, and thus, determines the ultimate quality of the final optimized code. The Next-Executing-Tail (NET) trace formation method used in HP Dynamo is an early example of such techniques. Many existing trace formation schemes are variants of NET. They work very well for most binary traces, but they also suffer a major problem: the formed traces may contain a large number of early exits that could be branched out during the execution. If this happens frequently, the program execution will spend more time in the slow binary interpreter or in the unoptimized code regions than in the optimized traces in code cache. The benefit of the trace optimization is thus lost. Traces/regions with frequently taken early-exits are called delinquent traces/regions. Our empirical study shows that at least 8 of the 12 SPEC CPU2006 integer benchmarks have delinquent traces. In this paper, we propose a light-weight region formation technique called Early-Exit Guided Region Formation (EEG) to improve the quality of the formed traces/regions. It iteratively identifies and merges delinquent regions into larger code regions. We have implemented our EEG algorithm in two LLVM-based multi-threaded DBTs targeting ARM and IA32 instruction set architecture (ISA), respectively. Using SPEC CPU2006 benchmark suite with reference inputs, our results show that compared to an NET-variant currently used in QEMU, a state-of-the-art retargetable DBT, EEG can achieve a significant performance improvement of up to 72\% (27\% on average), and to 49\% (23\% on average) for IA32 and ARM, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kaufmann:2013:SCO, author = "Marco Kaufmann and Rainer G. 
Spallek", title = "Superblock compilation and other optimization techniques for a {Java}-based {DBT} machine emulator", journal = j-SIGPLAN, volume = "48", number = "7", pages = "33--40", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451521", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Superblock compilation techniques such as control flow graph (CFG) or trace compilation have become a widely adopted approach to increase the performance of dynamically compiling virtual machines even further. While this was shown to be successful for many conventional virtual machines, it did not result in a higher performance for Java-based DBT machine emulators so far. These emulators dynamically translate application binaries of a target machine into Java bytecode, which is then eventually compiled into the native code of the emulating host by the Java Virtual Machine (JVM). Successful superblock compilation techniques for this class of emulators must consider the special requirements that result from the two-stage translation as well as the characteristics of the JVM, such as the inability of most Java JIT compilers to handle large bytecode methods efficiently. In this paper, we present a superblock compilation approach for a Java-based DBT machine emulator that generates a performance increase of up to 90 percent and of 32 percent on average. The key idea of our design is to provide a large scope over the control flow of target applications across basic block boundaries for the JVM, while still keeping small bytecode methods for the execution units. In addition, we also present two further optimizations --- interpreter context elimination and program counter elimination --- which increase the emulation performance by 16 percent again. In total, the optimization techniques discussed in this paper provide an average performance gain of 48 percent for the surveyed emulator.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jo:2013:ELM, author = "Changyeon Jo and Erik Gustafsson and Jeongseok Son and Bernhard Egger", title = "Efficient live migration of virtual machines using shared storage", journal = j-SIGPLAN, volume = "48", number = "7", pages = "41--50", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451524", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Live migration of virtual machines (VM) across distinct physical hosts is an important feature of virtualization technology for maintenance, load-balancing and energy reduction, especially so for data centers operators and cluster service providers. Several techniques have been proposed to reduce the downtime of the VM being transferred, often at the expense of the total migration time. 
In this work, we present a technique to reduce the total time required to migrate a running VM from one host to another while keeping the downtime to a minimum. Based on the observation that modern operating systems use the better part of the physical memory to cache data from secondary storage, our technique tracks the VM's I/O operations to the network-attached storage device and maintains an updated mapping of memory pages that currently reside in identical form on the storage device. During the iterative pre-copy live migration process, instead of transferring those pages from the source to the target host, the memory-to-disk mapping is sent to the target host which then fetches the contents directly from the network-attached storage device. We have implemented our approach in the Xen hypervisor and ran a series of experiments with Linux HVM guests. On average, the presented technique reduces the total transfer time by over 30\% for a series of benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chiang:2013:IBM, author = "Jui-Hao Chiang and Han-Lin Li and Tzi-cker Chiueh", title = "Introspection-based memory de-duplication and migration", journal = j-SIGPLAN, volume = "48", number = "7", pages = "51--62", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451525", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Memory virtualization abstracts a physical machine's memory resource and presents to the virtual machines running on it a piece of physical memory that could be shared, compressed and moved. To optimize the memory resource utilization by fully leveraging the flexibility afforded by memory virtualization, it is essential that the hypervisor have some sense of how the guest VMs use their allocated physical memory. One way to do this is virtual machine introspection (VMI), which interprets byte values in a guest memory space into semantically meaningful data structures. However, identifying a guest VM's memory usage information such as the free memory pool is non-trivial.
This paper describes a bootstrapping VM introspection technique that can accurately extract free memory pool information from multiple versions of Windows and Linux without kernel version-specific hard-coding, shows how to apply this technique to improve the efficiency of memory de-duplication and memory state migration, and reports the resulting improvement in memory de-duplication speed, the gain in additional memory pages de-duplicated, and the reduction in traffic loads associated with memory state migration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cui:2013:VMV, author = "Lei Cui and Jianxin Li and Bo Li and Jinpeng Huai and Chunming Hu and Tianyu Wo and Hussain Al-Aqrabi and Lu Liu", title = "{VMScatter}: migrate virtual machines to many hosts", journal = j-SIGPLAN, volume = "48", number = "7", pages = "63--72", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451528", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Live virtual machine migration is a technique often used to migrate an entire OS with running applications in a non-disruptive fashion. Prior work has concentrated on one-to-one live migration, for which many techniques have been proposed, such as pre-copy, post-copy and log/replay. In contrast, we propose VMScatter, a one-to-many migration method to migrate virtual machines from one host to many other hosts simultaneously. First, by merging the identical pages within or across virtual machines, VMScatter multicasts only a single copy of these pages to the associated target hosts, avoiding redundant transmission. This is especially effective in practice when the virtual machines run the same OS and similar applications, so that there are plenty of identical pages. Second, we introduce a novel grouping algorithm to decide the placement of virtual machines; unlike previous scheduling algorithms, which focus on workload for load balancing or power saving, it also takes network traffic into account, a critical metric in data-intensive data centers. Third, we schedule the multicast sequence of packets to reduce the network overhead introduced by target hosts joining or leaving multicast groups.
Compared to the traditional live migration technique in QEMU/KVM, VMScatter reduces the total transferred data by 74.2\% and the total migration time by 69.1\%, and achieves network traffic reductions of 50.1\% to 70.3\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhou:2013:OVM, author = "Ruijin Zhou and Fang Liu and Chao Li and Tao Li", title = "Optimizing virtual machine live storage migration in heterogeneous storage environment", journal = j-SIGPLAN, volume = "48", number = "7", pages = "73--84", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Virtual machine (VM) live storage migration techniques significantly increase the mobility and manageability of virtual machines in the era of cloud computing. On the other hand, as solid state drives (SSDs) become increasingly popular in data centers, VM live storage migration will inevitably encounter heterogeneous storage environments. Nevertheless, conventional migration mechanisms do not consider the speed discrepancy and the SSD's wear-out issue, which not only causes significant performance degradation but also shortens the SSD's lifetime. This paper, for the first time, addresses the efficiency of VM live storage migration in heterogeneous storage environments from a multi-dimensional perspective, i.e., user experience, device wearing, and manageability. We derive a flexible metric (migration cost), which captures various design preferences. Based on that, we propose and prototype three new storage migration strategies, namely: (1) Low Redundancy (LR), which generates the least amount of redundant writes; (2) Source-based Low Redundancy (SLR), which keeps the balance between IO performance and write redundancy; and (3) Asynchronous IO Mirroring, which seeks the highest IO performance. The evaluation of our prototyped system shows that our techniques outperform existing live storage migration by a significant margin. Furthermore, by adaptively mixing our proposed schemes, the cost of massive VM live storage migration can be even lower than that of using only the best individual mechanism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Song:2013:PLM, author = "Xiang Song and Jicheng Shi and Ran Liu and Jian Yang and Haibo Chen", title = "Parallelizing live migration of virtual machines", journal = j-SIGPLAN, volume = "48", number = "7", pages = "85--96", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451531", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Live VM migration is one of the major primitive operations used to manage virtualized cloud platforms. Such operations are usually mission-critical and disruptive to the running services, and thus should be completed as fast as possible.
Unfortunately, with the increasing amount of resources configured for a VM, such operations are becoming increasingly time-consuming. In this paper, we make a comprehensive analysis of the parallelization opportunities of live VM migration on two popular open-source VMMs (i.e., Xen and KVM). By leveraging abundant resources like CPU cores and NICs in contemporary server platforms, we design and implement a system called PMigrate that leverages data parallelism and pipeline parallelism to parallelize the operation. As the parallelization framework requires intensive mmap/munmap operations that tax the address space management system in an operating system, we further propose an abstraction called range lock, which improves the scalability of concurrent mutation of the address space of an operating system (i.e., Linux) by selectively replacing the per-process address space lock inside the kernel with dynamic and fine-grained range locks that exclude costly operations on the requesting address range from using the per-process lock. Evaluation with our working prototype on Xen and KVM shows that PMigrate accelerates live VM migration by 2.49X to 9.88X and decreases the downtime by 1.9X to 279.89X. Performance analysis shows that our integration of range locks into Linux significantly improves parallelism in mutating the address space during VM migration and thus boosts performance by 2.06X to 3.05X. We also show that PMigrate causes only minor disruption to other co-hosted production VMs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fu:2013:EUD, author = "Yangchun Fu and Zhiqiang Lin", title = "{EXTERIOR}: using a dual-{VM} based external shell for guest-{OS} introspection, configuration, and recovery", journal = j-SIGPLAN, volume = "48", number = "7", pages = "97--110", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "This paper presents EXTERIOR, a dual-VM architecture based external shell that can be used for trusted, timely out-of-VM management of the guest-OS, such as introspection, configuration, and recovery. Inspired by recent advances in virtual machine introspection (VMI), EXTERIOR leverages an isolated, secure virtual machine (SVM) to introspect the kernel state of a guest virtual machine (GVM). However, it goes far beyond the read-only capability of traditional VMI, and can perform automatic, fine-grained guest-OS writable operations. The key idea of EXTERIOR is to use a dual-VM architecture in which an SVM runs a kernel identical to that of the GVM to create the necessary environment for a running process (e.g., {\tt rmmod}, {\tt kill}), and dynamically and transparently redirect and update the memory state at the VMM layer from the SVM to the GVM, thereby achieving the same effect in terms of kernel state updates as running the same trusted in-VM program inside the shell of the GVM. A proof-of-concept EXTERIOR system has been implemented.
The experimental results show that EXTERIOR can be used for timely administration of the guest-OS, including introspection and (re)configuration of the guest-OS state and timely response to kernel malware intrusions, without any user account in the guest-OS.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dai:2013:LVM, author = "Yuehua Dai and Yong Qi and Jianbao Ren and Yi Shi and Xiaoguang Wang and Xuan Yu", title = "A lightweight {VMM} on many core for high performance computing", journal = j-SIGPLAN, volume = "48", number = "7", pages = "111--120", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451535", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "A traditional Virtual Machine Monitor (VMM) virtualizes some devices and instructions, which imposes performance overhead on guest operating systems. Furthermore, virtualization contributes a large amount of code to the VMM, which makes a VMM prone to bugs and vulnerabilities. On the other hand, in cloud computing, a cloud service provider configures virtual machines based on requirements which are specified by customers in advance. As resources in a multi-core server become more than adequate in the future, virtualization is not strictly necessary, although it provides convenience for cloud computing. Based on the above observations, this paper presents an alternative way of constructing a VMM: configuring a booting interface instead of using virtualization technology. A lightweight virtual machine monitor --- OSV --- is proposed based on this idea. OSV can host multiple fully functional Linux kernels with little performance overhead. There are only 6 hyper-calls in OSV. The Linux running on top of OSV is intercepted only for the inter-processor interrupts. The resource isolation is implemented with hardware-assisted virtualization. The resource sharing is controlled by distributed protocols embedded in current operating systems. We implement a prototype of OSV on AMD Opteron processor-based 32-core servers with SVM and cache-coherent NUMA architectures. OSV can host up to 8 Linux kernels on the server with less than 10 lines of code modifications to the Linux kernel. OSV has about 8000 lines of code, which can be easily tuned and debugged.
The experimental results show that the OSV VMM achieves a 23.7\% performance improvement compared with the Xen VMM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yamada:2013:TFT, author = "Hiroshi Yamada and Kenji Kono", title = "Traveling forward in time to newer operating systems using {ShadowReboot}", journal = j-SIGPLAN, volume = "48", number = "7", pages = "121--130", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451536", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Operating system (OS) reboots are an essential part of updating kernels and applications on laptops and desktop PCs. Long downtime during OS reboots severely disrupts users' computational activities. This long disruption discourages users from conducting OS reboots, and thus from applying software updates promptly. This paper presents ShadowReboot, a virtual machine monitor (VMM)-based approach that shortens the downtime of OS reboots during software updates. ShadowReboot conceals OS reboot activities from users' applications by spawning a VM dedicated to an OS reboot and systematically producing the rebooted state where the updated kernel and applications are ready for use. ShadowReboot provides users with the illusion that the guest OS travels forward in time to the rebooted state. ShadowReboot offers the following advantages. First, it can be used to apply kernel patches and even system configuration updates. Next, it does not require any special patch requiring detailed knowledge about the target kernels. Lastly, it does not require any target kernel modification. We implemented a prototype in VirtualBox 4.0.10 OSE. Our experimental results show that ShadowReboot successfully updated software on unmodified commodity OS kernels and shortened the downtime of commodity OS reboots on five Linux distributions (Fedora, Ubuntu, Gentoo, Cent, and SUSE) by 91 to 98\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jantz:2013:PPO, author = "Michael R. Jantz and Prasad A. Kulkarni", title = "Performance potential of optimization phase selection during dynamic {JIT} compilation", journal = j-SIGPLAN, volume = "48", number = "7", pages = "131--142", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451539", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Phase selection is the process of customizing the applied set of compiler optimization phases for individual functions or programs to improve the performance of generated code. Researchers have recently developed novel feature-vector based heuristic techniques to perform phase selection during online JIT compilation. While these heuristics improve program startup speed, steady-state performance was not seen to benefit over the default fixed single sequence baseline.
Unfortunately, it is still not conclusively known whether this lack of steady-state performance gain is due to a failure of existing online phase selection heuristics, or because there is, indeed, little or no speedup to be gained by phase selection in online JIT environments. The goal of this work is to resolve this question, while examining the phase selection related behavior of optimizations, and assessing and improving the effectiveness of existing heuristic solutions. We conduct experiments to find and understand the potency of the factors that can cause the phase selection problem in JIT compilers. Next, using long-running genetic algorithms we determine that program-wide and method-specific phase selection in the HotSpot JIT compiler can produce ideal steady-state performance gains of up to 15\% (4.3\% average) and 44\% (6.2\% average) respectively. We also find that existing state-of-the-art heuristic solutions are unable to realize these performance gains (in our experimental setup), discuss possible causes, and show that exploiting knowledge of optimization phase behavior can help improve such heuristic solutions. Our work develops a robust open-source production-quality framework using the HotSpot JVM to further explore this problem in the future.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lameed:2013:MAS, author = "Nurudeen A. Lameed and Laurie J. Hendren", title = "A modular approach to on-stack replacement in {LLVM}", journal = j-SIGPLAN, volume = "48", number = "7", pages = "143--154", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451541", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/matlab.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "On-stack replacement (OSR) is a technique that allows a virtual machine to interrupt running code during the execution of a function/method, to re-optimize the function on-the-fly using an optimizing JIT compiler, and then to resume the interrupted function at the point and state at which it was interrupted. OSR is particularly useful for programs with potentially long-running loops, as it allows dynamic optimization of those loops as soon as they become hot. This paper presents a modular approach to implementing OSR for the LLVM compiler infrastructure. This is an important step forward because LLVM is gaining popular support, and adding the OSR capability allows compiler developers to develop new dynamic techniques. In particular, it will enable more sophisticated LLVM-based JIT compiler approaches. Indeed, other compiler/VM developers can use our approach because it is a clean modular addition to the standard LLVM distribution. Further, our approach is defined completely at the LLVM-IR level and thus does not require any modifications to the target code generation. The OSR implementation can be used by different compilers to support a variety of dynamic optimizations. As a demonstration of our OSR approach, we have used it to support dynamic inlining in McVM. McVM is a virtual machine for MATLAB which uses a LLVM-based JIT compiler. 
MATLAB is a popular dynamic language for scientific and engineering applications that typically manipulate large matrices and often contain long-running loops, and is thus an ideal target for dynamic JIT compilation and OSRs. Using our McVM example, we demonstrate reasonable overheads for our benchmark set, and performance improvements when using it to perform dynamic inlining.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jantz:2013:FAG, author = "Michael R. Jantz and Carl Strickland and Karthik Kumar and Martin Dimitrov and Kshitij A. Doshi", title = "A framework for application guidance in virtual memory systems", journal = j-SIGPLAN, volume = "48", number = "7", pages = "155--166", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451543", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "This paper proposes a collaborative approach in which applications can provide guidance to the operating system regarding allocation and recycling of physical memory. The operating system incorporates this guidance to decide which physical page should be used to back a particular virtual page. The key intuition behind this approach is that application software, as a generator of memory accesses, is best equipped to inform the operating system about the relative access rates and overlapping patterns of usage of its own address space. It is also capable of steering its own algorithms in order to keep its dynamic memory footprint under check when there is a need to reduce power or to contain the spillover effects from bursts in demand. Application software, working cooperatively with the operating system, can therefore help the latter schedule memory more effectively and efficiently than when the operating system is forced to act alone without such guidance. It is particularly difficult to achieve power efficiency without application guidance since power expended in memory is a function not merely of the intensity with which memory is accessed in time but also how many physical ranks are affected by an application's memory usage. Our framework introduces an abstraction called ``colors'' for the application to communicate its intent to the operating system. We modify the operating system to receive this communication in an efficient way, and to organize physical memory pages into intermediate level grouping structures called ``trays'' which capture the physically independent access channels and self-refresh domains, so that it can apply this guidance without entangling the application in lower level details of power or bandwidth management. 
This paper describes how we re-architect the memory management of a recent Linux kernel to realize a three way collaboration between hardware, supervisory software, and application tasks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2013:TVR, author = "Chen Chen and Petros Maniatis and Adrian Perrig and Amit Vasudevan and Vyas Sekar", title = "Towards verifiable resource accounting for outsourced computation", journal = j-SIGPLAN, volume = "48", number = "7", pages = "167--178", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451546", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Outsourced computation services should ideally only charge customers for the resources used by their applications. Unfortunately, no verifiable basis for service providers and customers to reconcile resource accounting exists today. This leads to undesirable outcomes for both providers and consumers-providers cannot prove to customers that they really devoted the resources charged, and customers cannot verify that their invoice maps to their actual usage. As a result, many practical and theoretical attacks exist, aimed at charging customers for resources that their applications did not consume. Moreover, providers cannot charge consumers precisely, which causes them to bear the cost of unaccounted resources or pass these costs inefficiently to their customers. We introduce ALIBI, a first step toward a vision for verifiable resource accounting. ALIBI places a minimal, trusted reference monitor underneath the service provider's software platform. This monitor observes resource allocation to customers' guest virtual machines and reports those observations to customers, for verifiable reconciliation. In this paper, we show that ALIBI efficiently and verifiably tracks guests' memory use and CPU-cycle consumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhou:2013:LPC, author = "Ruijin Zhou and Tao Li", title = "Leveraging phase change memory to achieve efficient virtual machine execution", journal = j-SIGPLAN, volume = "48", number = "7", pages = "179--190", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451547", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "Virtualization technology is being widely adopted by servers and data centers in the cloud computing era to improve resource utilization and energy efficiency. Nevertheless, the heterogeneous memory demands from multiple virtual machines (VM) make it more challenging to design efficient memory systems. Even worse, mission critical VM management activities (e.g. checkpointing) could incur significant runtime overhead due to intensive IO operations. 
In this paper, we propose to leverage the adaptable and non-volatile features of the emerging phase change memory (PCM) to achieve efficient virtual machine execution. Towards this end, we exploit VM-aware PCM management mechanisms, which (1) smartly tune SLC/MLC page allocation within a single VM and across different VMs and (2) keep critical checkpointing pages in PCM to reduce I/O traffic. Experimental results show that our single VM design (IntraVM) improves performance by 10\% and 20\% compared to pure SLC- and MLC- based systems. Further incorporating VM-aware resource management schemes (IntraVM+InterVM) increases system performance by 15\%. In addition, our design saves 46\% of checkpoint/restore duration and reduces 50\% of overall IO penalty to the system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ouyang:2013:PTS, author = "Jiannan Ouyang and John R. Lange", title = "Preemptable ticket spinlocks: improving consolidated performance in the cloud", journal = j-SIGPLAN, volume = "48", number = "7", pages = "191--200", month = jul, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517326.2451549", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:55:17 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "VEE '13 Conference proceedings.", abstract = "When executing inside a virtual machine environment, OS level synchronization primitives are faced with significant challenges due to the scheduling behavior of the underlying virtual machine monitor. Operations that are ensured to last only a short amount of time on real hardware, are capable of taking considerably longer when running virtualized. This change in assumptions has significant impact when an OS is executing inside a critical region that is protected by a spinlock. The interaction between OS level spinlocks and VMM scheduling is known as the Lock Holder Preemption problem and has a significant impact on overall VM performance. However, with the use of ticket locks instead of generic spinlocks, virtual environments must also contend with waiters being preempted before they are able to acquire the lock. This has the effect of blocking access to a lock, even if the lock itself is available. We identify this scenario as the Lock Waiter Preemption problem. In order to solve both problems we introduce Preemptable Ticket spinlocks, a new locking primitive that is designed to enable a VM to always make forward progress by relaxing the ordering guarantees offered by ticket locks. 
We show that the use of Preemptable Ticket spinlocks improves VM performance by 5.32X on average when running on a non-paravirtual VMM, and by 7.91X when running on a VMM that supports a paravirtual locking interface, when executing a set of microbenchmarks as well as a realistic e-commerce benchmark.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yang:2013:PSC, author = "Chao Yang and Wei Xue and Haohuan Fu and Lin Gan and Linfeng Li and Yangtong Xu and Yutong Lu and Jiachang Sun and Guangwen Yang and Weimin Zheng", title = "A peta-scalable {CPU-GPU} algorithm for global atmospheric simulations", journal = j-SIGPLAN, volume = "48", number = "8", pages = "1--12", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Developing highly scalable algorithms for global atmospheric modeling is becoming increasingly important as scientists seek to understand the behavior of the global atmosphere at extreme scales. Nowadays, heterogeneous architectures based on both processors and accelerators are becoming an important solution for large-scale computing. However, large-scale simulation of the global atmosphere brings a severe challenge to the development of highly scalable algorithms that fit well into state-of-the-art heterogeneous systems. Although successes have been achieved in GPU-accelerated computing in some top-level applications, studies on fully exploiting heterogeneous architectures in global atmospheric modeling are still rare, due in large part to both the computational difficulties of the mathematical models and the requirement of high accuracy for long-term simulations. In this paper, we propose a peta-scalable hybrid algorithm that is successfully applied in a cubed-sphere shallow-water model in global atmospheric simulations. We employ an adjustable partition between CPUs and GPUs to achieve a balanced utilization of the entire hybrid system, and present a pipe-flow scheme to conduct conflict-free inter-node communication on the cubed-sphere geometry and to maximize communication-computation overlap. Systematic optimizations for multithreading on both GPU and CPU sides are performed to enhance computing throughput and improve memory efficiency. Our experiments demonstrate nearly ideal strong and weak scalabilities on up to 3,750 nodes of the Tianhe-1A.
The largest run sustains a performance of 0.8 Pflops in double precision (32\% of the peak performance), using 45,000 CPU cores and 3,750 GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lifflander:2013:APF, author = "Jonathan Lifflander and Phil Miller and Laxmikant Kale", title = "Adoption protocols for fanout-optimal fault-tolerant termination detection", journal = j-SIGPLAN, volume = "48", number = "8", pages = "13--22", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442519", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Termination detection is relevant for signaling completion (all processors are idle and no messages are in flight) of many operations in distributed systems, including work stealing algorithms, dynamic data exchange, and dynamically structured computations. In the face of growing supercomputers with increasing likelihood that each job may encounter faults, it is important for high-performance computing applications that rely on termination detection that such an algorithm be able to tolerate the inevitable faults. We provide a trio of new practical fault tolerance schemes for a standard approach to termination detection that are easy to implement, present low overhead in both theory and practice, and have scalable costs when recovering from faults. These schemes tolerate all single-process faults, and are probabilistically tolerant of faults affecting multiple processes. We combine the theoretical failure probabilities we can calculate for each algorithm with historical fault records from real machines to show that these algorithms have excellent overall survivability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yuki:2013:ADA, author = "Tomofumi Yuki and Paul Feautrier and Sanjay Rajopadhye and Vijay Saraswat", title = "Array dataflow analysis for polyhedral {X10} programs", journal = j-SIGPLAN, volume = "48", number = "8", pages = "23--34", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442520", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "This paper addresses the static analysis of an important class of X10 programs, namely those with finish/async parallelism, and affine loops and array reference structure as in the polyhedral model. For such programs our analysis can certify whenever a program is deterministic or flags races. Our key contributions are (i) adaptation of array dataflow analysis from the polyhedral model to programs with finish/async parallelism, and (ii) use of the array dataflow analysis result to certify determinacy. We distinguish our work from previous approaches by combining the precise statement instance-wise and array element-wise analysis capability of the polyhedral model with finish/async programs that are more expressive than DOALL parallelism commonly considered in the polyhedral literature. 
We show that our approach is exact (no false negative/positives) and more precise than previous approaches, but is limited to programs that fit the polyhedral model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Prountzos:2013:BCA, author = "Dimitrios Prountzos and Keshav Pingali", title = "Betweenness centrality: algorithms and implementations", journal = j-SIGPLAN, volume = "48", number = "8", pages = "35--46", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442521", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Betweenness centrality is an important metric in the study of social networks, and several algorithms for computing this metric exist in the literature. This paper makes three contributions. First, we show that the problem of computing betweenness centrality can be formulated abstractly in terms of a small set of operators that update the graph. Second, we show that existing parallel algorithms for computing betweenness centrality can be viewed as implementations of different schedules for these operators, permitting all these algorithms to be formulated in a single framework. Third, we derive a new asynchronous parallel algorithm for betweenness centrality that (i) works seamlessly for both weighted and unweighted graphs, (ii) can be applied to large graphs, and (iii) is able to extract large amounts of parallelism. We implemented this algorithm and compared it against a number of publicly available implementations of previous algorithms on two different multicore architectures. Our results show that the new algorithm is the best performing one in most cases, particularly for large graphs and large thread counts, and is always competitive against other algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Xiang:2013:CAM, author = "Lingxiang Xiang and Michael Lee Scott", title = "Compiler aided manual speculation for high performance concurrent data structures", journal = j-SIGPLAN, volume = "48", number = "8", pages = "47--56", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Speculation is a well-known means of increasing parallelism among concurrent methods that are usually but not always independent. Traditional nonblocking data structures employ a particularly restrictive form of speculation. Software transactional memory (STM) systems employ a much more general---though typically blocking---form, and there is a wealth of options in between. Using several different concurrent data structures as examples, we show that manual addition of speculation to traditional lock-based code can lead to significant performance improvements. Successful speculation requires careful consideration of profitability, and of how and when to validate consistency. 
Unfortunately, it also requires substantial modifications to code structure and a deep understanding of the memory model. These latter requirements make it difficult to use in its purely manual form, even for expert programmers. To simplify the process, we present a compiler tool, CSpec, that automatically generates speculative code from baseline lock-based code with user annotations. Compiler-aided manual speculation keeps the original code structure for better readability and maintenance, while providing the flexibility to choose speculation and validation strategies. Experiments on UltraSPARC and x86 platforms demonstrate that with a small number of annotations added to lock-based code, CSpec can generate speculative code that matches the performance of best-effort hand-written versions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wu:2013:CAA, author = "Bo Wu and Zhijia Zhao and Eddy Zheng Zhang and Yunlian Jiang and Xipeng Shen", title = "Complexity analysis and algorithm design for reorganizing data to minimize non-coalesced memory accesses on {GPU}", journal = j-SIGPLAN, volume = "48", number = "8", pages = "57--68", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442523", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "The performance of Graphic Processing Units (GPUs) is sensitive to irregular memory references. Some recent work shows the promise of data reorganization for eliminating non-coalesced memory accesses that are caused by irregular references. However, all previous studies have employed simple, heuristic methods to determine the new data layouts to create. As a result, they either do not provide any performance guarantee or are effective in only some limited scenarios. This paper contributes a fundamental study of the problem. It systematically analyzes the inherent complexity of the problem in various settings, and for the first time, proves that the problem is NP-complete. It then points out the limitations of existing techniques and reveals that in practice, the essence of designing an appropriate data reorganization algorithm can be reduced to a tradeoff among space, time, and complexity. Based on that insight, it develops two new data reorganization algorithms to overcome the limitations of previous methods.
Experiments show that an assembly composed of the new algorithms and a previous algorithm can circumvent the inherent complexity in finding optimal data layouts, making it feasible to minimize non-coalesced memory accesses for a variety of irregular applications and settings that are beyond the reach of existing techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Le:2013:CEW, author = "Nhat Minh L{\^e} and Antoniu Pop and Albert Cohen and Francesco Zappa Nardelli", title = "Correct and efficient work-stealing for weak memory models", journal = j-SIGPLAN, volume = "48", number = "8", pages = "69--80", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442524", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Chase and Lev's concurrent deque is a key data structure in shared-memory parallel programming and plays an essential role in work-stealing schedulers. We provide the first correctness proof of an optimized implementation of Chase and Lev's deque on top of the POWER and ARM architectures: these provide very relaxed memory models, which we exploit to improve performance but considerably complicate the reasoning. We also study an optimized x86 and a portable C11 implementation, conducting systematic experiments to evaluate the impact of memory barrier optimizations. Our results demonstrate the benefits of hand tuning the deque code when running on top of relaxed memory models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bergstrom:2013:DOF, author = "Lars Bergstrom and Matthew Fluet and Mike Rainey and John Reppy and Stephen Rosen and Adam Shaw", title = "Data-only flattening for nested data parallelism", journal = j-SIGPLAN, volume = "48", number = "8", pages = "81--92", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442525", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Data parallelism has proven to be an effective technique for high-level programming of a certain class of parallel applications, but it is not well suited to irregular parallel computations. Blelloch and others proposed nested data parallelism (NDP) as a language mechanism for programming irregular parallel applications in a declarative data-parallel style. The key to this approach is a compiler transformation that flattens the NDP computation and data structures into a form that can be executed efficiently on a wide-vector SIMD architecture. Unfortunately, this technique is ill suited to execution on today's multicore machines. We present a new technique, called data-only flattening, for the compilation of NDP, which is suitable for multicore architectures. Data-only flattening transforms nested data structures in order to expose programs to various optimizations while leaving control structures intact. We present a formal semantics of data-only flattening in a core language with a rewriting system. 
We demonstrate the effectiveness of this technique in the Parallel ML implementation and we report encouraging experimental results across various benchmark applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Morozov:2013:DMT, author = "Dmitriy Morozov and Gunther Weber", title = "Distributed merge trees", journal = j-SIGPLAN, volume = "48", number = "8", pages = "93--102", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442526", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Improved simulations and sensors are producing datasets whose increasing complexity exhausts our ability to visualize and comprehend them directly. To cope with this problem, we can detect and extract significant features in the data and use them as the basis for subsequent analysis. Topological methods are valuable in this context because they provide robust and general feature definitions. As the growth of serial computational power has stalled, data analysis is becoming increasingly dependent on massively parallel machines. To satisfy the computational demand created by complex datasets, algorithms need to effectively utilize these computer architectures. The main strength of topological methods, their emphasis on global information, turns into an obstacle during parallelization. We present two approaches to alleviate this problem. We develop a distributed representation of the merge tree that avoids computing the global tree on a single processor and lets us parallelize subsequent queries. To account for the increasing number of cores per processor, we develop a new data structure that lets us take advantage of multiple shared-memory cores to parallelize the work on a single node. Finally, we present experiments that illustrate the strengths of our approach as well as help identify future challenges.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Morrison:2013:FCQ, author = "Adam Morrison and Yehuda Afek", title = "Fast concurrent queues for x86 processors", journal = j-SIGPLAN, volume = "48", number = "8", pages = "103--112", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442527", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Conventional wisdom in designing concurrent data structures is to use the most powerful synchronization primitive, namely compare-and-swap (CAS), and to avoid contended hot spots. In building concurrent FIFO queues, this reasoning has led researchers to propose combining-based concurrent queues. 
This paper takes a different approach, showing how to rely on fetch-and-add (F\&A), a less powerful primitive that is available on x86 processors, to construct a nonblocking (lock-free) linearizable concurrent FIFO queue which, despite the F\&A being a contended hot spot, outperforms combining-based implementations by 1.5x to 2.5x at all concurrency levels on an x86 server with four multicore processors, in both single-processor and multi-processor executions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wamhoff:2013:FIP, author = "Jons-Tobias Wamhoff and Christof Fetzer and Pascal Felber and Etienne Rivi{\`e}re and Gilles Muller", title = "{FastLane}: improving performance of software transactional memory for low thread counts", journal = j-SIGPLAN, volume = "48", number = "8", pages = "113--122", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442528", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Software transactional memory (STM) can lead to scalable implementations of concurrent programs, as the relative performance of an application increases with the number of threads that support it. However, the absolute performance is typically impaired by the overheads of transaction management and instrumented accesses to shared memory. This often leads STM-based programs with low thread counts to perform worse than a sequential, non-instrumented version of the same application. In this paper, we propose FastLane, a new STM algorithm that bridges the performance gap between sequential execution and classical STM algorithms when running on few cores. FastLane seeks to reduce instrumentation costs and thus performance degradation in its target operation range. We introduce a novel algorithm that differentiates between two types of threads: One thread (the master) executes transactions pessimistically without ever aborting, thus with minimal instrumentation and management costs, while other threads (the helpers) can commit speculative transactions only when they do not conflict with the master. Helpers thus contribute to the application's progress without impairing the performance of the master. We implement FastLane as an extension of a state-of-the-art STM runtime system and compiler. Multiple code paths are produced for execution on a single core, a few cores, and many cores. The runtime system selects the code path providing the best throughput, depending on the number of cores available on the target machine.
Evaluation results indicate that our approach provides promising performance at low thread counts: FastLane almost systematically wins over a classical STM in the 1-6 threads range, and often performs better than sequential execution of the non-instrumented version of the same application starting with 2 threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Barthe:2013:RVS, author = "Gilles Barthe and Juan Manuel Crespo and Sumit Gulwani and Cesar Kunz and Mark Marron", title = "From relational verification to {SIMD} loop synthesis", journal = j-SIGPLAN, volume = "48", number = "8", pages = "123--134", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Existing pattern-based compiler technology is unable to effectively exploit the full potential of SIMD architectures. We present a new program synthesis based technique for auto-vectorizing performance critical innermost loops. Our synthesis technique is applicable to a wide range of loops, consistently produces performant SIMD code, and generates correctness proofs for the output code. The synthesis technique, which leverages existing work on relational verification methods, is a novel combination of deductive loop restructuring, synthesis condition generation and a new inductive synthesis algorithm for producing loop-free code fragments. The inductive synthesis algorithm wraps an optimized depth-first exploration of code sequences inside a CEGIS loop. Our technique is able to quickly produce SIMD implementations (up to 9 instructions in 0.12 seconds) for a wide range of fundamental looping structures. The resulting SIMD implementations outperform the original loops by 2.0x-3.7x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shun:2013:LLG, author = "Julian Shun and Guy E. Blelloch", title = "{Ligra}: a lightweight graph processing framework for shared memory", journal = j-SIGPLAN, volume = "48", number = "8", pages = "135--146", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442530", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "There has been significant recent interest in parallel frameworks for processing graphs due to their applicability in studying social networks, the Web graph, networks in biology, and unstructured meshes in scientific simulation. Due to the desire to process large graphs, these systems have emphasized the ability to run on distributed memory machines. Today, however, a single multicore server can support more than a terabyte of memory, which can fit graphs with tens or even hundreds of billions of edges. Furthermore, for graph algorithms, shared-memory multicores are generally significantly more efficient on a per core, per dollar, and per joule basis than distributed memory systems, and shared-memory algorithms tend to be simpler than their distributed counterparts. 
In this paper, we present a lightweight graph processing framework that is specific for shared-memory parallel/multicore machines, which makes graph traversal algorithms easy to write. The framework has two very simple routines, one for mapping over edges and one for mapping over vertices. Our routines can be applied to any subset of the vertices, which makes the framework useful for many graph traversal algorithms that operate on subsets of the vertices. Based on recent ideas used in a very fast algorithm for breadth-first search (BFS), our routines automatically adapt to the density of vertex sets. We implement several algorithms in this framework, including BFS, graph radii estimation, graph connectivity, betweenness centrality, PageRank and single-source shortest paths. Our algorithms expressed using this framework are very simple and concise, and perform almost as well as highly optimized code. Furthermore, they get good speedups on a 40-core machine and are significantly more efficient than previously reported results using graph frameworks on machines with many more cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nasre:2013:MAG, author = "Rupesh Nasre and Martin Burtscher and Keshav Pingali", title = "Morph algorithms on {GPUs}", journal = j-SIGPLAN, volume = "48", number = "8", pages = "147--156", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442531", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "There is growing interest in using GPUs to accelerate graph algorithms such as breadth-first search, computing page-ranks, and finding shortest paths. However, these algorithms do not modify the graph structure, so their implementation is relatively easy compared to general graph algorithms like mesh generation and refinement, which morph the underlying graph in non-trivial ways by adding and removing nodes and edges. We know relatively little about how to implement morph algorithms efficiently on GPUs. In this paper, we present and study four morph algorithms: (i) a computational geometry algorithm called Delaunay Mesh Refinement (DMR), (ii) an approximate SAT solver called Survey Propagation (SP), (iii) a compiler analysis called Points-To Analysis (PTA), and (iv) Boruvka's Minimum Spanning Tree algorithm (MST). Each of these algorithms modifies the graph data structure in different ways and thus poses interesting challenges. We overcome these challenges using algorithmic and GPU-specific optimizations. We propose efficient techniques to perform concurrent subgraph addition, subgraph deletion, conflict detection and several optimizations to improve the scalability of morph algorithms. For an input mesh with 10 million triangles, our DMR code achieves an 80x speedup over the highly optimized serial Triangle program and a 2.3x speedup over a multicore implementation running with 48 threads. Our SP code is 3x faster than a multicore implementation with 48 threads on an input with 1 million literals. The PTA implementation is able to analyze six SPEC 2000 benchmark programs in just 74 milliseconds, achieving a geometric mean speedup of 9.3x over a 48-thread multicore version. 
Our MST code is slower than a multicore version with 48 threads for sparse graphs but significantly faster for denser graphs. This work provides several insights into how other morph algorithms can be efficiently implemented on GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Calciu:2013:NAR, author = "Irina Calciu and Dave Dice and Yossi Lev and Victor Luchangco and Virendra J. Marathe and Nir Shavit", title = "{NUMA}-aware reader-writer locks", journal = j-SIGPLAN, volume = "48", number = "8", pages = "157--166", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442532", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Non-Uniform Memory Access (NUMA) architectures are gaining importance in mainstream computing systems due to the rapid growth of multi-core multi-chip machines. Extracting the best possible performance from these new machines will require us to revisit the design of the concurrent algorithms and synchronization primitives which form the building blocks of many of today's applications. This paper revisits one such critical synchronization primitive --- the reader-writer lock. We present what is, to the best of our knowledge, the first family of reader-writer lock algorithms tailored to NUMA architectures. We present several variations which trade fairness between readers and writers for higher concurrency among readers and better back-to-back batching of writers from the same NUMA node. Our algorithms leverage the lock cohorting technique to manage synchronization between writers in a NUMA-friendly fashion, binary flags to coordinate readers and writers, and simple distributed reader counter implementations to enable NUMA-friendly concurrency among readers. The end result is a collection of surprisingly simple NUMA-aware algorithms that outperform the state-of-the-art reader-writer locks by up to a factor of 10 in our microbenchmark experiments. To evaluate our algorithms in a realistic setting we also present performance results of the {\tt kccachetest} benchmark of the Kyoto-Cabinet distribution, an open-source database which makes heavy use of pthread reader-writer locks. Our locks boost the performance of {\tt kccachetest} by up to 40\% over the best prior alternatives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2013:OAO, author = "Zizhong Chen", title = "{Online-ABFT}: an online algorithm based fault tolerance scheme for soft error detection in iterative methods", journal = j-SIGPLAN, volume = "48", number = "8", pages = "167--176", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442533", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Soft errors are one-time events that corrupt the state of a computing system but not its overall functionality. Large supercomputers are especially susceptible to soft errors because of their large number of components. 
Soft errors can generally be detected offline through the comparison of the final computation results of two duplicated computations, but this approach often introduces significant overhead. This paper presents Online-ABFT, a simple but efficient online soft error detection technique that can detect soft errors in the widely used Krylov subspace iterative methods in the middle of the program execution so that the computation efficiency can be improved through the termination of the corrupted computation in a timely manner soon after a soft error occurs. Based on a simple verification of orthogonality and residual, Online-ABFT is easy to implement and highly efficient. Experimental results demonstrate that, when this online error detection approach is used together with checkpointing, it improves the time to obtain correct results by up to several orders of magnitude over the traditional offline approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Friedley:2013:OPE, author = "Andrew Friedley and Torsten Hoefler and Greg Bronevetsky and Andrew Lumsdaine and Ching-Chen Ma", title = "Ownership passing: efficient distributed memory programming on multi-core systems", journal = j-SIGPLAN, volume = "48", number = "8", pages = "177--186", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "The number of cores in multi- and many-core high-performance processors is steadily increasing. MPI, the de-facto standard for programming high-performance computing systems, offers a distributed memory programming model. MPI's semantics force a copy from one process' send buffer to another process' receive buffer. This makes it difficult to achieve the same performance on modern hardware as shared memory programs, which are arguably harder to maintain and debug. We propose generalizing MPI's communication model to include ownership passing, which makes it possible to fully leverage the shared memory hardware of multi- and many-core CPUs to stream communicated data concurrently with the receiver's computations on it. The benefits and simplicity of message passing are retained by extending MPI with calls to send (pass) ownership of memory regions, instead of their contents, between processes. Ownership passing is achieved with a hybrid MPI implementation that runs MPI processes as threads and is mostly transparent to the user. We propose an API and a static analysis technique to transform legacy MPI codes automatically and transparently to the programmer, demonstrating that this scheme is easy to use in practice. Using the ownership passing technique, we see up to 51\% communication speedups over a standard message passing implementation on state-of-the-art multicore systems. Our analysis and interface will lay the groundwork for future development of MPI-aware optimizing compilers and multi-core specific optimizations, which will be key for success in current and next-generation computing platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Meyerovich:2013:PSS, author = "Leo A.
Meyerovich and Matthew E. Torok and Eric Atkinson and Rastislav Bodik", title = "Parallel schedule synthesis for attribute grammars", journal = j-SIGPLAN, volume = "48", number = "8", pages = "187--196", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442535", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "We examine how to synthesize a parallel schedule of structured traversals over trees. In our system, programs are declaratively specified as attribute grammars. Our synthesizer automatically, correctly, and quickly schedules the attribute grammar as a composition of parallel tree traversals. Our downstream compiler optimizes for GPUs and multicore CPUs. We provide support for designing efficient schedules. First, we introduce a declarative language of schedules where programmers may constrain any part of the schedule and the synthesizer will complete and autotune the rest. Furthermore, the synthesizer answers debugging queries about how schedules may be completed. We evaluate our approach with two case studies. First, we created the first parallel schedule for a large fragment of CSS and report a 3X multicore speedup. Second, we created an interactive GPU-accelerated animation of over 100,000 nodes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Deo:2013:PSA, author = "Mrinal Deo and Sean Keely", title = "Parallel suffix array and least common prefix for the {GPU}", journal = j-SIGPLAN, volume = "48", number = "8", pages = "197--206", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442536", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Suffix Array (SA) is a data structure formed by sorting the suffixes of a string into lexicographic order. SAs have been used in a variety of applications, most notably in pattern matching and Burrows--Wheeler Transform (BWT) based lossless data compression. SAs have also become the data structure of choice for many, if not all, string processing problems to which suffix tree methodology is applicable. Over the last two decades researchers have proposed many suffix array construction algorithm (SACAs). We do a systematic study of the main classes of SACAs with the intent of mapping them onto a data parallel architecture like the GPU. We conclude that skew algorithm [12], a linear time recursive algorithm, is the best candidate for GPUs as all its phases can be efficiently mapped to a data parallel hardware. Our OpenCL implementation of skew algorithm achieves a throughput of up to 25 MStrings/sec and a speedup of up to 34x and 5.8x over a single threaded CPU implementation using a discrete GPU and APU respectively. We also compare our OpenCL implementation against the fastest known CPU implementation based on induced copying and achieve a speedup of up to 3.7x. 
Using SA we construct BWT on GPU and achieve a speedup of 11x over the fastest known BWT on GPU. Suffix arrays are often augmented with the longest common prefix (LCP) information. We design a novel high-performance parallel algorithm for computing LCP on the GPU. Our GPU implementation of LCP achieves a speedup of up to 25x and 4.3x on discrete GPU and APU respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2013:SDR, author = "Yufei Chen and Haibo Chen", title = "Scalable deterministic replay in a parallel full-system emulator", journal = j-SIGPLAN, volume = "48", number = "8", pages = "207--218", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442537", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Full-system emulation has been an extremely useful tool in developing and debugging systems software like operating systems and hypervisors. However, current full-system emulators lack the support for deterministic replay, which limits the reproducibility of concurrency bugs that is indispensable for analyzing and debugging the essentially multi-threaded systems software. This paper analyzes the challenges in supporting deterministic replay in parallel full-system emulators and makes a comprehensive study on the sources of non-determinism. Unlike application-level replay systems, our system, called ReEmu, needs to log sources of non-determinism in both the guest software stack and the dynamic binary translator for faithful replay. To provide scalable and efficient record and replay on multicore machines, ReEmu makes several notable refinements to the CREW protocol that replays shared memory systems. First, being aware of the performance bottlenecks in frequent lock operations in the CREW protocol, ReEmu refines the CREW protocol with a seqlock-like design, to avoid serious contention and possible starvation in instrumentation code tracking dependence of racy accesses on a shared memory object. Second, to minimize the required log files, ReEmu only logs minimal local information regarding accesses to a shared memory location, but instead relies on an offline log processing tool to derive precise shared memory dependence for faithful replay. Third, ReEmu adopts an automatic lock clustering mechanism that clusters a set of uncontended memory objects to a bulk to reduce the frequencies of lock operations, which noticeably boost performance. Our prototype ReEmu is based on our open-source COREMU system and supports scalable and efficient record and replay of full-system environments (both x64 and ARM). Performance evaluation shows that ReEmu has very good performance scalability on an Intel multicore machine. It incurs only 68.9\% performance overhead on average (ranging from 51.8\% to 94.7\%) over vanilla COREMU to record five PARSEC benchmarks running on a 16-core emulated system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Acar:2013:SPP, author = "Umut A. 
Acar and Arthur Chargueraud and Mike Rainey", title = "Scheduling parallel programs by work stealing with private deques", journal = j-SIGPLAN, volume = "48", number = "8", pages = "219--228", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442538", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Work stealing has proven to be an effective method for scheduling parallel programs on multicore computers. To achieve high performance, work stealing distributes tasks between concurrent queues, called deques, which are assigned to each processor. Each processor operates on its deque locally except when performing load balancing via steals. Unfortunately, concurrent deques suffer from two limitations: (1) local deque operations require expensive memory fences in modern weak-memory architectures, (2) they can be very difficult to extend to support various optimizations and flexible forms of task distribution strategies needed by many applications, e.g., those that do not fit nicely into the divide-and-conquer, nested data parallel paradigm. For these reasons, there has been a lot of recent interest in implementations of work stealing with non-concurrent deques, where deques remain entirely private to each processor and load balancing is performed via message passing. Private deques eliminate the need for memory fences from local operations and enable the design and implementation of efficient techniques for reducing task-creation overheads and improving task distribution. These advantages, however, come at the cost of communication. It is not known whether work stealing with private deques enjoys the theoretical guarantees of concurrent deques and whether they can be effective in practice. In this paper, we propose two work-stealing algorithms with private deques and prove that the algorithms guarantee similar theoretical bounds as work stealing with concurrent deques. For the analysis, we use a probabilistic model and consider a new parameter, the branching depth of the computation. We present an implementation of the algorithm as a C++ library and show that it compares well to Cilk on a range of benchmarks. Since our approach relies on private deques, it enables implementing flexible task creation and distribution strategies.
As a specific example, we show how to implement task coalescing and steal-half strategies, which can be important in fine-grain, non-divide-and-conquer algorithms such as graph algorithms, and apply them to the depth-first-search problem.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yan:2013:SFS, author = "Shengen Yan and Guoping Long and Yunquan Zhang", title = "{StreamScan}: fast scan algorithms for {GPUs} without global barrier synchronization", journal = j-SIGPLAN, volume = "48", number = "8", pages = "229--238", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442539", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Scan (also known as prefix sum) is a very useful primitive for various important parallel algorithms, such as sort, BFS, SpMV, compaction and so on. Current state of the art of GPU based scan implementation consists of three consecutive Reduce-Scan-Scan phases. This approach requires at least two global barriers and 3N (N is the problem size) global memory accesses. In this paper we propose StreamScan, a novel approach to implement scan on GPUs with only one computation phase. The main idea is to restrict synchronization to only adjacent workgroups, and thereby eliminating global barrier synchronization completely. The new approach requires only 2N global memory accesses and just one kernel invocation. On top of this we propose two important optimizations to further boost performance speedups, namely thread grouping to eliminate unnecessary local barriers, and register optimization to expand the on chip problem size. We designed an auto-tuning framework to search the parameter space automatically to generate highly optimized codes for both AMD and Nvidia GPUs. We implemented our technique with OpenCL. Compared with previous fast scan implementations, experimental results not only show promising performance speedups, but also reveal dramatic different optimization tradeoffs between Nvidia and AMD GPU platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Heumann:2013:TEM, author = "Stephen T. Heumann and Vikram S. Adve and Shengjie Wang", title = "The tasks with effects model for safe concurrency", journal = j-SIGPLAN, volume = "48", number = "8", pages = "239--250", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442540", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Today's widely-used concurrent programming models either provide weak safety guarantees, making it easy to write code with subtle errors, or are limited in the class of programs that they can express. We propose a new concurrent programming model based on tasks with effects that offers strong safety guarantees while still providing the flexibility needed to support the many ways that concurrency is used in complex applications. 
The core unit of work in our model is a dynamically-created task. The model's key feature is that each task has programmer-specified effects, and a run-time scheduler is used to ensure that two tasks are run concurrently only if they have non-interfering effects. Through the combination of statically verifying the declared effects of tasks and using an effect-aware run-time scheduler, our model is able to guarantee strong safety properties, including data race freedom and atomicity. It is also possible to use our model to write programs and computations that can be statically proven to behave deterministically. We describe the tasks with effects programming model and provide a formal dynamic semantics for it. We also describe our implementation of this model in an extended version of Java and evaluate its use in several programs exhibiting various patterns of concurrency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bonetta:2013:TPE, author = "Daniele Bonetta and Walter Binder and Cesare Pautasso", title = "{TigerQuoll}: parallel event-based {JavaScript}", journal = j-SIGPLAN, volume = "48", number = "8", pages = "251--260", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442541", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "JavaScript, the most popular language on the Web, is rapidly moving to the server-side, becoming even more pervasive. Still, JavaScript lacks support for shared memory parallelism, making it challenging for developers to exploit multicores present in both servers and clients. In this paper we present TigerQuoll, a novel API and runtime for parallel programming in JavaScript. TigerQuoll features an event-based API and a parallel runtime allowing applications to exploit a mutable shared memory space. The programming model of TigerQuoll features automatic consistency and concurrency management, such that developers do not have to deal with shared-data synchronization. TigerQuoll supports an innovative transaction model that allows for eventual consistency to speed up high-contention workloads. Experiments show that TigerQuoll applications scale well, allowing one to implement common parallelism patterns in JavaScript.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dice:2013:UHT, author = "Dave Dice and Yossi Lev and Yujie Liu and Victor Luchangco and Mark Moir", title = "Using hardware transactional memory to correct and simplify a readers-writer lock algorithm", journal = j-SIGPLAN, volume = "48", number = "8", pages = "261--270", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442542", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Designing correct synchronization algorithms is notoriously difficult, as evidenced by a bug we have identified that has apparently gone unnoticed in a well-known synchronization algorithm for nearly two decades.
We use hardware transactional memory (HTM) to construct a corrected version of the algorithm. This version is significantly simpler than the original and furthermore improves on it by eliminating usage constraints and reducing space requirements. Performance of the HTM-based algorithm is competitive with the original in ``normal'' conditions, but it does suffer somewhat under heavy contention. We successfully apply some optimizations to help close this gap, but we also find that they are incompatible with known techniques for improving progress properties. We discuss ways in which future HTM implementations may address these issues. Finally, although our focus is on how effectively HTM can correct and simplify the algorithm, we also suggest bug fixes and workarounds that do not depend on HTM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cascaval:2013:ZPW, author = "Calin Cascaval and Seth Fowler and Pablo Montesinos-Ortego and Wayne Piekarski and Mehrdad Reshadi and Behnam Robatmili and Michael Weber and Vrajesh Bhavsar", title = "{ZOOMM}: a parallel {Web} browser engine for multicore mobile devices", journal = j-SIGPLAN, volume = "48", number = "8", pages = "271--280", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442543", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "We explore the challenges in expressing and managing concurrency in browsers on mobile devices. Browsers are complex applications that implement multiple standards, need to support legacy behavior, and are highly dynamic and interactive. We present ZOOMM, a highly concurrent web browser engine prototype and show how concurrency is effectively exploited at different levels: speed up computation performance, preload network resources, and preprocess resources outside the critical path of page loading. On a dual-core Android mobile device we demonstrate that ZOOMM is two times faster than the native WebKit based browser when loading the set of pages defined in the Vellamo benchmark.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Grasso:2013:APS, author = "Ivan Grasso and Klaus Kofler and Biagio Cosenza and Thomas Fahringer", title = "Automatic problem size sensitive task partitioning on heterogeneous parallel systems", journal = j-SIGPLAN, volume = "48", number = "8", pages = "281--282", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442545", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "In this paper we propose a novel approach which automatizes task partitioning in heterogeneous systems. Our framework is based on the Insieme Compiler and Runtime infrastructure. The compiler translates a single-device OpenCL program into a multi-device OpenCL program. The runtime system then performs dynamic task partitioning based on an offline-generated prediction model. 
In order to derive the prediction model, we use a machine learning approach that incorporates static program features as well as dynamic, input sensitive features. Our approach has been evaluated over a suite of 23 programs and achieves performance improvements compared to an execution of the benchmarks on a single CPU and a single GPU only.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liu:2013:DLO, author = "Jun Liu and Wei Ding and Ohyoung Jang and Mahmut Kandemir", title = "Data layout optimization for {GPGPU} architectures", journal = j-SIGPLAN, volume = "48", number = "8", pages = "283--284", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442546", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "GPUs are being widely used in accelerating general-purpose applications, leading to the emergence of GPGPU architectures. New programming models, e.g., Compute Unified Device Architecture (CUDA), have been proposed to facilitate programming general-purpose computations in GPGPUs. However, writing high-performance CUDA codes manually is still tedious and difficult. In particular, the organization of the data in the memory space can greatly affect the performance due to the unique features of a custom GPGPU memory hierarchy. In this work, we propose an automatic data layout transformation framework to solve the key issues associated with a GPGPU memory hierarchy (i.e., channel skewing, data coalescing, and bank conflicts). Our approach employs a widely applicable strategy based on a novel concept called data localization. Specifically, we try to optimize the layout of the arrays accessed in affine loop nests, for both the device memory and shared memory, at both coarse grain and fine grain parallelization levels. We performed an experimental evaluation of our data layout optimization strategy using 15 benchmarks on an NVIDIA CUDA GPU device. The results show that the proposed data transformation approach brings around 4.3X speedup on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Padmanabhan:2013:DTO, author = "Shobana Padmanabhan and Yixin Chen and Roger D. Chamberlain", title = "Decomposition techniques for optimal design-space exploration of streaming applications", journal = j-SIGPLAN, volume = "48", number = "8", pages = "285--286", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442547", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Streaming data programs are an important class of applications, for which queueing network models are frequently available. While the design space can be large, decomposition techniques can be effective at design space reduction. 
We introduce two decomposition techniques called convex decomposition and unchaining and present implications for a biosequence search application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Yu:2013:EDA, author = "Xiaodong Yu and Michela Becchi", title = "Exploring different automata representations for efficient regular expression matching on {GPUs}", journal = j-SIGPLAN, volume = "48", number = "8", pages = "287--288", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442548", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Regular expression matching is a central task in several networking (and search) applications and has been accelerated on a variety of parallel architectures. All solutions are based on finite automata (either in deterministic or non-deterministic form), and mostly focus on effective memory representations for such automata. Recently, a handful of work has proposed efficient regular expression matching designs for GPUs; however, most of them aim at achieving good performance on small datasets. Nowadays, practical solutions must support the increased size and complexity of real world datasets. In this work, we explore the deployment and optimization of different GPU designs of regular expression matching engines, focusing on large datasets containing a large number of complex patterns.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Edmonds:2013:EGA, author = "Nick Edmonds and Jeremiah Willcock and Andrew Lumsdaine", title = "Expressing graph algorithms using generalized active messages", journal = j-SIGPLAN, volume = "48", number = "8", pages = "289--290", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442549", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Recently, graph computation has emerged as an important class of high-performance computing application whose characteristics differ markedly from those of traditional, compute-bound, kernels. Libraries such as BLAS, LAPACK, and others have been successful in codifying best practices in numerical computing. The data-driven nature of graph applications necessitates a more complex application stack incorporating runtime optimization. In this paper, we present a method of phrasing graph algorithms as collections of asynchronous, concurrently executing, concise code fragments which may be invoked both locally and in remote address spaces. A runtime layer performs a number of dynamic optimizations, including message coalescing, message combining, and software routing. 
Practical implementations and performance results are provided for a number of representative algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lu:2013:MLP, author = "Ligang Lu and Karen Magerlein", title = "Multi-level parallel computing of reverse time migration for seismic imaging on {Blue Gene/Q}", journal = j-SIGPLAN, volume = "48", number = "8", pages = "291--292", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442550", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Blue Gene/Q (BG/Q) is an early representative of increasing scale and thread count that will characterize future HPC systems: large counts of nodes, cores, and threads; and a rich programming environment with many degrees of freedom in parallel computing optimization. So it is both a challenge and an opportunity to use it to accelerate seismic imaging applications to unprecedented levels that will significantly advance the technologies for the oil and gas industry. In this work we aim to address two important questions: how HPC systems with high levels of scale and thread count will perform in real applications; and how systems with many degrees of freedom in parallel programming can be calibrated to achieve optimal performance. Based on BG/Q's architecture features and RTM workload characteristics, we developed massive domain partition, MPI, and SIMD optimization strategies. Our detailed deep analyses in various aspects of optimization also provide valuable experience and insights into how such systems can be utilized to facilitate the advance of seismic imaging technologies. Our BG/Q RTM solution achieved a 14.93x speedup over the BG/P implementation. Our multi-level parallelism strategies for Reverse Time Migration (RTM) seismic imaging computing on BG/Q provide an example of how HPC systems like BG/Q can accelerate applications to a new level.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Park:2013:PPB, author = "Changhee Park and Guy L. {Steele, Jr.} and Jean-Baptiste Tristan", title = "Parallel programming with big operators", journal = j-SIGPLAN, volume = "48", number = "8", pages = "293--294", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442551", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "In the sciences, it is common to use the so-called ``big operator'' notation to express the iteration of a binary operator (the reducer) over a collection of values. Such a notation typically assumes that the reducer is associative and abstracts the iteration process. Consequently, from a programming point-of-view, we can organize the reducer operations to minimize the depth of the overall reduction, allowing a potentially parallel evaluation of a big operator expression.
We believe that the big operator notation is indeed an effective construct to express parallel computations in the Generate/Map/Reduce programming model, and our goal is to introduce it in programming languages to support parallel programming. The effective definition of such a big operator expression requires a simple way to generate elements, and a simple way to declare algebraic properties of the reducer (such as its identity, or its commutativity). In this poster, we want to present an extension of Scala with support for big operator expressions. We show how big operator expressions are defined and how the API is organized to support the simple definition of reducers with their algebraic properties.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Afek:2013:PHL, author = "Yehuda Afek and Amir Levy and Adam Morrison", title = "Programming with hardware lock elision", journal = j-SIGPLAN, volume = "48", number = "8", pages = "295--296", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442552", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "We present a simple yet effective technique for improving performance of lock-based code using the hardware lock elision (HLE) feature in Intel's upcoming Haswell processor. We also describe how to extend Haswell's HLE mechanism to achieve a similar effect to our lock elision scheme entirely in hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lu:2013:REM, author = "Kai Lu and Xu Zhou and Xiaoping Wang and Wenzhe Zhang and Gen Li", title = "{RaceFree}: an efficient multi-threading model for determinism", journal = j-SIGPLAN, volume = "48", number = "8", pages = "297--298", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442553", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Current deterministic systems generally incur large overhead due to the difficulty of detecting and eliminating data races. This paper presents RaceFree, a novel multi-threading runtime that adopts a relaxed deterministic model to provide a data-race-free environment for parallel programs. This model cuts off unnecessary shared-memory communication by isolating threads in separated memories, which eliminates direct data races. Meanwhile, we leverage the happen-before relation defined by applications themselves as one-way communication pipes to perform necessary thread communication. Shared-memory communication is transparently converted to message-passing style communication by our Memory Modification Propagation (MMP) mechanism, which propagates local memory modifications to other threads through the happen-before relation pipes. 
The overhead of RaceFree is 67.2\% according to our tests on parallel benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shun:2013:RCT, author = "Julian Shun and Guy E. Blelloch and Jeremy T. Fineman and Phillip B. Gibbons", title = "Reducing contention through priority updates", journal = j-SIGPLAN, volume = "48", number = "8", pages = "299--300", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442554", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Diamos:2013:RAM, author = "Gregory Diamos and Haicheng Wu and Jin Wang and Ashwin Lele and Sudhakar Yalamanchili", title = "Relational algorithms for multi-bulk-synchronous processors", journal = j-SIGPLAN, volume = "48", number = "8", pages = "301--302", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442555", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Relational databases remain an important application infrastructure for organizing and analyzing massive volumes of data. At the same time, processor architectures are increasingly gravitating towards Multi-Bulk-Synchronous processor (Multi-BSP) architectures employing throughput-optimized memory systems, lightweight multi-threading, and Single-Instruction Multiple-Data (SIMD) core organizations. This paper explores the mapping of primitive relational algebra operations onto such architectures to improve the throughput of data warehousing applications built on relational databases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Carvalho:2013:RET, author = "Fernando Miguel Carvalho and Jo{\~a}o Cachopo", title = "Runtime elision of transactional barriers for captured memory", journal = j-SIGPLAN, volume = "48", number = "8", pages = "303--304", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442556", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "In this paper, we propose a new technique that can identify transaction-local memory (i.e. captured memory ), in managed environments, while having a low runtime overhead. We implemented our proposal in a well known STM framework (Deuce) and we tested it in STMBench7 with two different STMs: TL2 and LSA. In both STMs the performance improved significantly (4 times and 2.6 times, respectively). 
Moreover, running the STAMP benchmarks with our approach shows improvements of 7 times in the best case for the Vacation application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Park:2013:SDR, author = "Chang-Seo Park and Koushik Sen and Costin Iancu", title = "Scalable data race detection for partitioned global address space programs", journal = j-SIGPLAN, volume = "48", number = "8", pages = "305--306", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442557", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Contemporary and future programming languages for HPC promote hybrid parallelism and shared memory abstractions using a global address space. In this programming style, data races occur easily and are notoriously hard to find. Previous work on data race detection for shared memory programs reports 10X-100X slowdowns for non-scientific programs. Previous work on distributed memory programs instruments only communication operations. In this paper we present the first complete implementation of data race detection at scale for UPC programs. Our implementation tracks local and global memory references in the program and it uses two techniques to reduce the overhead: (1) hierarchical function and instruction level sampling; and (2) exploiting the runtime persistence of aliasing and locality specific to Partitioned Global Address Space applications. The results indicate that both techniques are required in practice: well optimized instruction sampling introduces overheads as high as 6500\% (65X slowdown), while each technique in separation is able to reduce it to 1000\% (10X slowdown). When applying the optimizations in conjunction our tool finds all previously known data races in our benchmark programs with at most 50\% overhead. Furthermore, while previous results illustrate the benefits of function level sampling, our experiences show that this technique does not work for scientific programs: instruction sampling or a hybrid approach is required.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dice:2013:SSC, author = "Dave Dice and Yossi Lev and Mark Moir", title = "Scalable statistics counters", journal = j-SIGPLAN, volume = "48", number = "8", pages = "307--308", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442558", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Naive statistics counters that are commonly used to monitor system events and performance become a scalability bottleneck as systems become larger and more NUMA; furthermore some are so inaccurate that they are not useful. We present a number of techniques to address these problems, evaluating solutions in terms of performance, scalability, space overhead, and accuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wozniak:2013:SSD, author = "Justin M. 
Wozniak and Timothy G. Armstrong and Michael Wilde and Daniel S. Katz and Ewing Lusk and Ian T. Foster", title = "{Swift/T}: scalable data flow programming for many-task applications", journal = j-SIGPLAN, volume = "48", number = "8", pages = "309--310", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442559", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Swift/T, a novel programming language implementation for highly scalable data flow programs, is presented.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cai:2013:TST, author = "Yan Cai and Ke Zhai and Shangru Wu and W. K. Chan", title = "{TeamWork}: synchronizing threads globally to detect real deadlocks for multithreaded programs", journal = j-SIGPLAN, volume = "48", number = "8", pages = "311--312", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442560", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "This paper presents the aim of TeamWork, our ongoing effort to develop a comprehensive dynamic deadlock confirmation tool for multithreaded programs. It also presents a refined object abstraction algorithm that refines the existing stack hash abstraction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{ElMehdiDiouri:2013:TEE, author = "Mohammed {El Mehdi Diouri} and Olivier Gl{\"u}ck and Laurent Lef{\`e}vre and Franck Cappello", title = "Towards an energy estimator for fault tolerance protocols", journal = j-SIGPLAN, volume = "48", number = "8", pages = "313--314", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442561", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Checkpointing protocols have different energy consumption depending on parameters like application features and platform characteristics. 
To select a protocol for a given execution, we propose an energy estimator that relies on an energy calibration of the considered platform and a user description of the execution settings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wimmer:2013:WSC, author = "Martin Wimmer and Daniel Cederman and Jesper Larsson Tr{\"a}ff and Philippas Tsigas", title = "Work-stealing with configurable scheduling strategies", journal = j-SIGPLAN, volume = "48", number = "8", pages = "315--316", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442562", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Work-stealing systems are typically oblivious to the nature of the tasks they are scheduling. They do not know or take into account how long a task will take to execute or how many subtasks it will spawn. Moreover, task execution order is typically determined by an underlying task storage data structure, and cannot be changed. There are thus possibilities for optimizing task parallel executions by providing information on specific tasks and their preferred execution order to the scheduling system. We investigate generalizations of work-stealing and introduce a framework enabling applications to dynamically provide hints on the nature of specific tasks using scheduling strategies. Strategies can be used to independently control both local task execution and steal order. Strategies allow optimizations on specific tasks, in contrast to more conventional scheduling policies that are typically global in scope. Strategies are composable and allow different, specific scheduling choices for different parts of an application simultaneously. We have implemented a work-stealing system based on our strategy framework. A series of benchmarks demonstrates beneficial effects that can be achieved with scheduling strategies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhou:2013:WED, author = "Bowen Zhou and Milind Kulkarni and Saurabh Bagchi", title = "{WuKong}: effective diagnosis of bugs at large system scales", journal = j-SIGPLAN, volume = "48", number = "8", pages = "317--318", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442563", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "A key challenge in developing large scale applications (both in system size and in input size) is finding bugs that are latent at the small scales of testing, only manifesting when a program is deployed at large scales. Traditional statistical techniques fail because no error-free run is available at deployment scales for training purposes. Prior work used scaling models to detect anomalous behavior at large scales without being trained on correct behavior at that scale. However, that work cannot localize bugs automatically. 
In this paper, we extend that work in three ways: (i) we develop an automatic diagnosis technique, based on feature reconstruction; (ii) we design a heuristic to effectively prune the feature space; and (iii) we validate our design through one fault-injection study, finding that our system can effectively localize bugs in a majority of cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Norell:2013:IPD, author = "Ulf Norell", title = "Interactive programming with dependent types", journal = j-SIGPLAN, volume = "48", number = "9", pages = "1--2", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500610", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In dependently typed languages run-time values can appear in types, making it possible to give programs more precise types than in languages without dependent types. This can range from keeping track of simple invariants like the length of a list, to full functional correctness. In addition to having some correctness guarantees on the final program, assigning more precise types to programs means that you can get more assistance from the type checker while writing them. This is what I focus on here, demonstrating how the programming environment of Agda can help you when developing dependently typed programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Traytel:2013:VDP, author = "Dmitriy Traytel and Tobias Nipkow", title = "Verified decision procedures for {MSO} on words based on derivatives of regular expressions", journal = j-SIGPLAN, volume = "48", number = "9", pages = "3--12", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500612", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Monadic second-order logic on finite words (MSO) is a decidable yet expressive logic into which many decision problems can be encoded. Since MSO formulas correspond to regular languages, equivalence of MSO formulas can be reduced to the equivalence of some regular structures (e.g. automata). This paper presents a verified functional decision procedure for MSO formulas that is not based on automata but on regular expressions. Functional languages are ideally suited for this task: regular expressions are data types and functions on them are defined by pattern matching and recursion and are verified by structural induction. Decision procedures for regular expression equivalence have been formalized before, usually based on Brzozowski derivatives. Yet, for a straightforward embedding of MSO formulas into regular expressions an extension of regular expressions with a projection operation is required. We prove total correctness and completeness of an equivalence checker for regular expressions extended in that way. We also define a language-preserving translation of formulas into regular expressions with respect to two different semantics of MSO. Our results have been formalized and verified in the theorem prover Isabelle. 
Using Isabelle's code generation facility, this yields purely functional, formally verified programs that decide equivalence of MSO formulas.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Broadbent:2013:CSC, author = "Christopher Broadbent and Arnaud Carayol and Matthew Hague and Olivier Serre", title = "{C-SHORe}: a collapsible approach to higher-order verification", journal = j-SIGPLAN, volume = "48", number = "9", pages = "13--24", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500589", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Higher-order recursion schemes (HORS) have recently received much attention as a useful abstraction of higher-order functional programs with a number of new verification techniques employing HORS model-checking as their centrepiece. This paper contributes to the ongoing quest for a truly scalable model-checker for HORS by offering a different, automata theoretic perspective. We introduce the first practical model-checking algorithm that acts on a generalisation of pushdown automata equi-expressive with HORS called collapsible pushdown systems (CPDS). At its core is a substantial modification of a recently studied saturation algorithm for CPDS. In particular it is able to use information gathered from an approximate forward reachability analysis to guide its backward search. Moreover, we introduce an algorithm that prunes the CPDS prior to model-checking and a method for extracting counter-examples in negative instances. We compare our tool with the state-of-the-art verification tools for HORS and obtain encouraging results. In contrast to some of the main competition tackling the same problem, our algorithm is fixed-parameter tractable, and we also offer significantly improved performance over the only previously published tool of which we are aware that also enjoys this property. The tool and additional material are available from http://cshore.cs.rhul.ac.uk.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Petersen:2013:ASV, author = "Leaf Petersen and Dominic Orchard and Neal Glew", title = "Automatic {SIMD} vectorization for {Haskell}", journal = j-SIGPLAN, volume = "48", number = "9", pages = "25--36", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500605", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Expressing algorithms using immutable arrays greatly simplifies the challenges of automatic SIMD vectorization, since several important classes of dependency violations cannot occur. The Haskell programming language provides libraries for programming with immutable arrays, and compiler support for optimizing them to eliminate the overhead of intermediate temporary arrays. We describe an implementation of automatic SIMD vectorization in a Haskell compiler which gives substantial vector speedups for a range of programs written in a natural programming style. 
We compare performance with that of programs compiled by the Glasgow Haskell Compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Mainland:2013:EVI, author = "Geoffrey Mainland and Roman Leshchinskiy and Simon Peyton Jones", title = "Exploiting vector instructions with generalized stream fusion", journal = j-SIGPLAN, volume = "48", number = "9", pages = "37--48", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500601", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Stream fusion is a powerful technique for automatically transforming high-level sequence-processing functions into efficient implementations. It has been used to great effect in Haskell libraries for manipulating byte arrays, Unicode text, and unboxed vectors. However, some operations, like vector append, still do not perform well within the standard stream fusion framework. Others, like SIMD computation using the SSE and AVX instructions available on modern x86 chips, do not seem to fit in the framework at all. In this paper we introduce generalized stream fusion, which solves these issues. The key insight is to bundle together multiple stream representations, each tuned for a particular class of stream consumer. We also describe a stream representation suited for efficient computation with SSE instructions. Our ideas are implemented in modified versions of the GHC compiler and vector library. Benchmarks show that high-level Haskell code written using our compiler and libraries can produce code that is faster than both compiler- and hand-vectorized C.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{McDonell:2013:OPF, author = "Trevor L. McDonell and Manuel M. T. Chakravarty and Gabriele Keller and Ben Lippmeier", title = "Optimising purely functional {GPU} programs", journal = j-SIGPLAN, volume = "48", number = "9", pages = "49--60", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500595", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Purely functional, embedded array programs are a good match for SIMD hardware, such as GPUs. However, the naive compilation of such programs quickly leads to both code explosion and an excessive use of intermediate data structures. The resulting slow-down is not acceptable on target hardware that is usually chosen to achieve high performance. In this paper, we discuss two optimisation techniques, sharing recovery and array fusion, that tackle code explosion and eliminate superfluous intermediate structures. Both techniques are well known from other contexts, but they present unique challenges for an embedded language compiled for execution on a GPU. 
We present novel methods for implementing sharing recovery and array fusion, and demonstrate their effectiveness on a set of benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Bernardy:2013:TTC, author = "Jean-Philippe Bernardy and Moulin Guilhem", title = "Type-theory in color", journal = j-SIGPLAN, volume = "48", number = "9", pages = "61--72", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500577", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dependent type-theory aims to become the standard way to formalize mathematics at the same time as displacing traditional platforms for high-assurance programming. However, current implementations of type theory are still lacking, in the sense that some obvious truths require explicit proofs, making type-theory awkward to use for many applications, both in formalization and programming. In particular, notions of erasure are poorly supported. In this paper we propose an extension of type-theory with colored terms, color erasure and interpretation of colored types as predicates. The result is a more powerful type-theory: some definitions and proofs may be omitted as they become trivial, it becomes easier to program with precise types, and some parametricity results can be internalized.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Devriese:2013:TSM, author = "Dominique Devriese and Frank Piessens", title = "Typed syntactic meta-programming", journal = j-SIGPLAN, volume = "48", number = "9", pages = "73--86", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500575", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a novel set of meta-programming primitives for use in a dependently-typed functional language. The types of our meta-programs provide strong and precise guarantees about their termination, correctness and completeness. Our system supports type-safe construction and analysis of terms, types and typing contexts. Unlike alternative approaches, they are written in the same style as normal programs and use the language's standard functional computational model. We formalise the new meta-programming primitives, implement them as an extension of Agda, and provide evidence of usefulness by means of two compelling applications in the fields of datatype-generic programming and proof tactics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Ziliani:2013:MMT, author = "Beta Ziliani and Derek Dreyer and Neelakantan R. 
Krishnaswami and Aleksandar Nanevski and Viktor Vafeiadis", title = "{Mtac}: a monad for typed tactic programming in {Coq}", journal = j-SIGPLAN, volume = "48", number = "9", pages = "87--100", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500579", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Effective support for custom proof automation is essential for large scale interactive proof development. However, existing languages for automation via *tactics* either (a) provide no way to specify the behavior of tactics within the base logic of the accompanying theorem prover, or (b) rely on advanced type-theoretic machinery that is not easily integrated into established theorem provers. We present Mtac, a lightweight but powerful extension to Coq that supports dependently-typed tactic programming. Mtac tactics have access to all the features of ordinary Coq programming, as well as a new set of typed tactical primitives. We avoid the need to touch the trusted kernel typechecker of Coq by encapsulating uses of these new tactical primitives in a *monad*, and instrumenting Coq so that it executes monadic tactics during type inference.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Dolan:2013:FSF, author = "Stephen Dolan", title = "Fun with semirings: a functional pearl on the abuse of linear algebra", journal = j-SIGPLAN, volume = "48", number = "9", pages = "101--110", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500613", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Describing a problem using classical linear algebra is a very well-known problem-solving technique. If your question can be formulated as a question about real or complex matrices, then the answer can often be found by standard techniques. It's less well-known that very similar techniques still apply where instead of real or complex numbers we have a closed semiring, which is a structure with some analogue of addition and multiplication that need not support subtraction or division. We define a typeclass in Haskell for describing closed semirings, and implement a few functions for manipulating matrices and polynomials over them. 
We then show how these functions can be used to calculate transitive closures, find shortest or longest or widest paths in a graph, analyse the data flow of imperative programs, optimally pack knapsacks, and perform discrete event simulations, all by just providing an appropriate underlying closed semiring.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Bernardy:2013:EDC, author = "Jean-Philippe Bernardy and Koen Claessen", title = "Efficient divide-and-conquer parsing of practical context-free languages", journal = j-SIGPLAN, volume = "48", number = "9", pages = "111--122", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500576", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a divide-and-conquer algorithm for parsing context-free languages efficiently. Our algorithm is an instance of Valiant's (1975), who reduced the problem of parsing to matrix multiplications. We show that, while the conquer step of Valiant's is O(n$^3$) in the worst case, it improves to O(log$^3$ n), under certain conditions satisfied by many useful inputs. These conditions occur for example in program texts written by humans. The improvement happens because the multiplications involve an overwhelming majority of empty matrices. This result is relevant to modern computing: divide-and-conquer algorithms can be parallelized relatively easily.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Mairson:2013:FGT, author = "Harry George Mairson", title = "Functional geometry and the {Trait{\'e} de Lutherie}: functional pearl", journal = j-SIGPLAN, volume = "48", number = "9", pages = "123--132", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500617", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a functional programming approach to the design of outlines of eighteenth-century string instruments. The approach is based on the research described in Fran{\c{c}}ois Denis's book, Trait{\'e} de lutherie. The programming vernacular for Denis's instructions, which we call functional geometry, is meant to reiterate the historically justified language and techniques of this musical instrument design. The programming metaphor is entirely Euclidean, involving straightedge and compass constructions, with few (if any) numbers, and no Cartesian equations or grid. As such, it is also an interesting approach to teaching programming and mathematics without numerical calculation or equational reasoning. The advantage of this language-based, functional approach to lutherie is founded in the abstract characterization of common patterns in instrument design. These patterns include not only the abstraction of common straightedge and compass constructions, but of higher-order conceptualization of the instrument design process.
We also discuss the role of arithmetic, geometric, harmonic, and subharmonic proportions, and the use of their rational approximants.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Brady:2013:PRA, author = "Edwin Brady", title = "Programming and reasoning with algebraic effects and dependent types", journal = j-SIGPLAN, volume = "48", number = "9", pages = "133--144", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500581", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "One often cited benefit of pure functional programming is that pure code is easier to test and reason about, both formally and informally. However, real programs have side-effects including state management, exceptions and interactions with the outside world. Haskell solves this problem using monads to capture details of possibly side-effecting computations --- it provides monads for capturing state, I/O, exceptions, non-determinism, libraries for practical purposes such as CGI and parsing, and many others, as well as monad transformers for combining multiple effects. Unfortunately, useful as monads are, they do not compose very well. Monad transformers can quickly become unwieldy when there are lots of effects to manage, leading to a temptation in larger programs to combine everything into one coarse-grained state and exception monad. In this paper I describe an alternative approach based on handling algebraic effects, implemented in the IDRIS programming language. I show how to describe side effecting computations, how to write programs which compose multiple fine-grained effects, and how, using dependent types, we can use this approach to reason about states in effectful programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Kammar:2013:HA, author = "Ohad Kammar and Sam Lindley and Nicolas Oury", title = "Handlers in action", journal = j-SIGPLAN, volume = "48", number = "9", pages = "145--158", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500590", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Plotkin and Pretnar's handlers for algebraic effects occupy a sweet spot in the design space of abstractions for effectful computation. By separating effect signatures from their implementation, algebraic effects provide a high degree of modularity, allowing programmers to express effectful programs independently of the concrete interpretation of their effects. A handler is an interpretation of the effects of an algebraic computation. The handler abstraction adapts well to multiple settings: pure or impure, strict or lazy, static types or dynamic types. This is a position paper whose main aim is to popularise the handler abstraction. We give a gentle introduction to its use, a collection of illustrative examples, and a straightforward operational semantics. 
We describe our Haskell implementation of handlers in detail, outline the ideas behind our OCaml, SML, and Racket implementations, and present experimental results comparing handlers with existing code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Jones:2013:CSS, author = "Simon Peyton Jones", title = "Computer science as a school subject", journal = j-SIGPLAN, volume = "48", number = "9", pages = "159--160", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500609", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computer science is one of the richest, most exciting disciplines on the planet, yet any teenager will tell you that ICT (as it is called in UK schools --- ``information and communication technology'') is focused almost entirely on the use and application of computers, and in practice covers nothing about how computers work, nor programming, nor anything of the discipline of computer science as we understand it. Over the last two decades, computing at school has drifted from writing adventure games on the BBC Micro to writing business plans in Excel. This is bad for our young people's education, and it is bad for our economy. Nor is this phenomenon restricted to the UK: many countries are struggling with the same issues. Our young people should be educated not only in the application and use of digital technology, but also in how it works, and its foundational principles. Lacking such knowledge renders them powerless in the face of complex and opaque technology, disenfranchises them from making informed decisions about the digital society, and deprives our nations of a well-qualified stream of students enthusiastic and able to envision and design new digital systems. Can anything be done, given the enormous inertia of our various countries' educational systems? Sometimes, yes. After a decade of stasis, change has come to the UK. Over the last 18 months, there has been a wholesale reform of the English school computing curriculum, and substantial movement in Scotland and Wales. It now seems likely that computer science will, for the first time, become part of every child's education. This change has been driven not by institutions or by the government, but by a grass-roots movement of parents, teachers, university academics, software developers, and others. A key agent in this grass-roots movement---although not the only one---is the Computing At School Working Group (CAS). In this talk I will describe how CAS was born and developed, and the radical changes that have taken place since in the UK. 
I hope that this may be encouraging for those pushing water uphill in other parts of the world, and I will also try to draw out some lessons from our experience that may be useful to others.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Schmidt-Schauss:2013:CSH, author = "Manfred Schmidt-Schau{\ss} and David Sabel", title = "Correctness of an {STM} {Haskell} implementation", journal = j-SIGPLAN, volume = "48", number = "9", pages = "161--172", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500585", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A concurrent implementation of software transactional memory in Concurrent Haskell using a call-by-need functional language with processes and futures is given. The description of the small-step operational semantics is precise and explicit, and employs an early abort of conflicting transactions. A proof of correctness of the implementation is given for a contextual semantics with may- and should-convergence. This implies that our implementation is a correct evaluator for an abstract specification equipped with a big-step semantics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Pottier:2013:PPM, author = "Fran{\c{c}}ois Pottier and Jonathan Protzenko", title = "Programming with permissions in {Mezzo}", journal = j-SIGPLAN, volume = "48", number = "9", pages = "173--184", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500598", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Mezzo, a typed programming language of ML lineage. Mezzo is equipped with a novel static discipline of duplicable and affine permissions, which controls aliasing and ownership. This rules out certain mistakes, including representation exposure and data races, and enables new idioms, such as gradual initialization, memory re-use, and (type)state changes. Although the core static discipline disallows sharing a mutable data structure, Mezzo offers several ways of working around this restriction, including a novel dynamic ownership control mechanism which we dub ``adoption and abandon''.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Abel:2013:WRC, author = "Andreas M. 
Abel and Brigitte Pientka", title = "Wellfounded recursion with copatterns: a unified approach to termination and productivity", journal = j-SIGPLAN, volume = "48", number = "9", pages = "185--196", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500591", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we study strong normalization of a core language based on System F-omega which supports programming with finite and infinite structures. Building on our prior work, finite data such as finite lists and trees are defined via constructors and manipulated via pattern matching, while infinite data such as streams and infinite trees is defined by observations and synthesized via copattern matching. In this work, we take a type-based approach to strong normalization by tracking size information about finite and infinite data in the type. This guarantees compositionality. More importantly, the duality of pattern and copatterns provide a unifying semantic concept which allows us for the first time to elegantly and uniformly support both well-founded induction and coinduction by mere rewriting. The strong normalization proof is structured around Girard's reducibility candidates. As such our system allows for non-determinism and does not rely on coverage. Since System F-omega is general enough that it can be the target of compilation for the Calculus of Constructions, this work is a significant step towards representing observation-centric infinite data in proof assistants such as Coq and Agda.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Atkey:2013:PCG, author = "Robert Atkey and Conor McBride", title = "Productive coprogramming with guarded recursion", journal = j-SIGPLAN, volume = "48", number = "9", pages = "197--208", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500597", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Total functional programming offers the beguiling vision that, just by virtue of the compiler accepting a program, we are guaranteed that it will always terminate. In the case of programs that are not intended to terminate, e.g., servers, we are guaranteed that programs will always be productive. Productivity means that, even if a program generates an infinite amount of data, each piece will be generated in finite time. The theoretical underpinning for productive programming with infinite output is provided by the category theoretic notion of final coalgebras. Hence, we speak of co programming with non-well-founded co data, as a dual to programming with well-founded data like finite lists and trees. Systems that offer facilities for productive coprogramming, such as the proof assistants Coq and Agda, currently do so through syntactic guardedness checkers. Syntactic guardedness checkers ensure that all self-recursive calls are guarded by a use of a constructor. Such a check ensures productivity. Unfortunately, these syntactic checks are not compositional, and severely complicate coprogramming. 
Guarded recursion, originally due to Nakano, is tantalising as a basis for a flexible and compositional type-based approach to coprogramming. However, as we show, by itself, guarded recursion is not suitable for coprogramming due to the fact that there is no way to make finite observations on pieces of infinite data. In this paper, we introduce the concept of clock variables that index Nakano's guarded recursion. Clock variables allow us to ``close over'' the generation of infinite data, and to make finite observations, something that is not possible with guarded recursion alone.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Hinze:2013:USR, author = "Ralf Hinze and Nicolas Wu and Jeremy Gibbons", title = "Unifying structured recursion schemes", journal = j-SIGPLAN, volume = "48", number = "9", pages = "209--220", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500578", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Folds over inductive datatypes are well understood and widely used. In their plain form, they are quite restricted; but many disparate generalisations have been proposed that enjoy similar calculational benefits. There have also been attempts to unify the various generalisations: two prominent such unifications are the 'recursion schemes from comonads' of Uustalu, Vene and Pardo, and our own 'adjoint folds'. Until now, these two unified schemes have appeared incompatible. We show that this appearance is illusory: in fact, adjoint folds subsume recursion schemes from comonads. The proof of this claim involves standard constructions in category theory that are nevertheless not well known in functional programming: Eilenberg-Moore categories and bialgebras.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Krishnaswami:2013:HOF, author = "Neelakantan R. Krishnaswami", title = "Higher-order functional reactive programming without spacetime leaks", journal = j-SIGPLAN, volume = "48", number = "9", pages = "221--232", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500588", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Functional reactive programming (FRP) is an elegant approach to declaratively specify reactive systems. However, the powerful abstractions of FRP have historically made it difficult to predict and control the resource usage of programs written in this style. In this paper, we give a new language for higher-order reactive programming. Our language generalizes and simplifies prior type systems for reactive programming, by supporting the use of streams of streams, first-class functions, and higher-order operations. We also support many temporal operations beyond streams, such as terminatable streams, events, and even resumptions with first-class schedulers. 
Furthermore, our language supports an efficient implementation strategy permitting us to eagerly deallocate old values and statically rule out spacetime leaks, a notorious source of inefficiency in reactive programs. Furthermore, these memory guarantees are achieved without the use of a complex substructural type discipline. We also show that our implementation strategy of eager deallocation is safe, by showing the soundness of our type system with a novel step-indexed Kripke logical relation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Jeffrey:2013:FRP, author = "Alan Jeffrey", title = "Functional reactive programming with liveness guarantees", journal = j-SIGPLAN, volume = "48", number = "9", pages = "233--244", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500584", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Functional Reactive Programming (FRP) is an approach to the development of reactive systems which provides a pure functional interface, but which may be implemented as an abstraction of an imperative event-driven layer. FRP systems typically provide a model of behaviours (total time-indexed values, implemented as pull systems) and event sources (partial time-indexed values, implemented as push systems). In this paper, we investigate a type system for event-driven FRP programs which provide liveness guarantees, that is every input event is guaranteed to generate an output event. We show that FRP can be implemented on top of a model of sets and relations, and that the isomorphism between event sources and behaviours corresponds to the isomorphism between relations and set-valued functions. We then implement sets and relations using a model of continuations using the usual double-negation CPS transform. The implementation of behaviours as pull systems based on futures, and of event sources as push systems based on the observer pattern, thus arises from first principles. We also discuss a Java implementation of the FRP model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Morihata:2013:SCP, author = "Akimasa Morihata", title = "A short cut to parallelization theorems", journal = j-SIGPLAN, volume = "48", number = "9", pages = "245--256", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500580", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The third list-homomorphism theorem states that if a function is both foldr and foldl, it has a divide-and-conquer parallel implementation as well. In this paper, we develop a theory for obtaining such parallelization theorems. The key is a new proof of the third list-homomorphism theorem based on shortcut deforestation. 
The proof implies that there exists a divide-and-conquer parallel program of the form of h (x 'merge' y) = h$_1$ x $\odot$ h$_2$ y, where h is the subject of parallelization, merge is the operation of integrating independent substructures, h$_1$ and h$_2$ are computations applied to substructures, possibly in parallel, and $\odot$ merges the results calculated for substructures, if (i) h can be specified by two certain forms of iterative programs, and (ii) merge can be implemented by a function of a certain polymorphic type. Therefore, when requirement (ii) is fulfilled, h has a divide-and-conquer implementation if h has two certain forms of implementations. We show that our approach is applicable to structure-consuming operations by catamorphisms (folds), structure-generating operations by anamorphisms (unfolds), and their generalizations called hylomorphisms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Axelsson:2013:UCP, author = "Emil Axelsson and Koen Claessen", title = "Using circular programs for higher-order syntax: functional pearl", journal = j-SIGPLAN, volume = "48", number = "9", pages = "257--262", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500614", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This pearl presents a novel technique for constructing a first-order syntax tree directly from a higher-order interface. We exploit circular programming to generate names for new variables, resulting in a simple yet efficient method. Our motivating application is the design of embedded languages supporting variable binding, where it is convenient to use higher-order syntax when constructing programs, but first-order syntax when processing or transforming programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Balabonski:2013:WOM, author = "Thibaut Balabonski", title = "Weak optimality, and the meaning of sharing", journal = j-SIGPLAN, volume = "48", number = "9", pages = "263--274", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500606", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper we investigate laziness and optimal evaluation strategies for functional programming languages. We consider the weak lambda-calculus as a basis of functional programming languages, and we adapt to this setting the concepts of optimal reductions that were defined for the full lambda-calculus. We prove that the usual implementation of call-by-need using sharing is optimal, that is, normalizing any lambda-term with call-by-need requires exactly the same number of reduction steps as the shortest reduction sequence in the weak lambda-calculus without sharing. Furthermore, we prove that optimal reduction sequences without sharing are not computable.
Hence sharing is the only computable means to reach weak optimality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Weirich:2013:SFE, author = "Stephanie Weirich and Justin Hsu and Richard A. Eisenberg", title = "System {FC} with explicit kind equality", journal = j-SIGPLAN, volume = "48", number = "9", pages = "275--286", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500599", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "System FC, the core language of the Glasgow Haskell Compiler, is an explicitly-typed variant of System F with first-class type equality proofs called coercions. This extensible proof system forms the foundation for type system extensions such as type families (type-level functions) and Generalized Algebraic Datatypes (GADTs). Such features, in conjunction with kind polymorphism and datatype promotion, support expressive compile-time reasoning. However, the core language lacks explicit kind equality proofs. As a result, type-level computation does not have access to kind-level functions or promoted GADTs, the type-level analogues to expression-level features that have been so useful. In this paper, we eliminate such discrepancies by introducing kind equalities to System FC. Our approach is based on dependent type systems with heterogeneous equality and the ``Type-in-Type'' axiom, yet it preserves the metatheoretic properties of FC. In particular, type checking is simple, decidable and syntax directed. We prove the preservation and progress theorems for the extended language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Sculthorpe:2013:CMP, author = "Neil Sculthorpe and Jan Bracker and George Giorgidze and Andy Gill", title = "The constrained-monad problem", journal = j-SIGPLAN, volume = "48", number = "9", pages = "287--298", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500602", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In Haskell, there are many data types that would form monads were it not for the presence of type-class constraints on the operations on that data type. This is a frustrating problem in practice, because there is a considerable amount of support and infrastructure for monads that these data types cannot use. Using several examples, we show that a monadic computation can be restructured into a normal form such that the standard monad class can be used. The technique is not specific to monads, and we show how it can also be applied to other structures, such as applicative functors. 
One significant use case for this technique is domain-specific languages, where it is often desirable to compile a deep embedding of a computation to some other language, which requires restricting the types that can appear in that computation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Svenningsson:2013:SCR, author = "Josef David Svenningsson and Bo Joel Svensson", title = "Simple and compositional reification of monadic embedded languages", journal = j-SIGPLAN, volume = "48", number = "9", pages = "299--304", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500611", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When writing embedded domain specific languages in Haskell, it is often convenient to be able to make an instance of the Monad class to take advantage of the do-notation and the extensive monad libraries. Commonly it is desirable to compile such languages rather than just interpret them. This introduces the problem of monad reification, i.e. observing the structure of the monadic computation. We present a solution to the monad reification problem and illustrate it with a small robot control language. Monad reification is not new but the novelty of our approach is in its directness, simplicity and compositionality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Hidaka:2013:SRQ, author = "Soichiro Hidaka and Kazuyuki Asada and Zhenjiang Hu and Hiroyuki Kato and Keisuke Nakano", title = "Structural recursion for querying ordered graphs", journal = j-SIGPLAN, volume = "48", number = "9", pages = "305--318", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500608", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Structural recursion, in the form of, for example, folds on lists and catamorphisms on algebraic data structures including trees, plays an important role in functional programming, by providing a systematic way for constructing and manipulating functional programs. It is, however, a challenge to define structural recursions for graph data structures, the most ubiquitous sort of data in computing. This is because unlike lists and trees, graphs are essentially not inductive and cannot be formalized as an initial algebra in general. In this paper, we borrow from the database community the idea of structural recursion on how to restrict recursions on infinite unordered regular trees so that they preserve the finiteness property and become terminating, which are desirable properties for query languages. We propose a new graph transformation language called lambdaFG for transforming and querying ordered graphs, based on the well-defined bisimulation relation on ordered graphs with special epsilon-edges. 
The language lambdaFG is a higher order graph transformation language that extends the simply typed lambda calculus with graph constructors and more powerful structural recursions, which is extended for transformations on the sibling dimension. It not only gives a general framework for manipulating graphs and reasoning about them, but also provides a solution to the open problem of how to define a structural recursion on ordered graphs, with the help of the bisimilarity for ordered graphs with epsilon-edges.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Delaware:2013:MMM, author = "Benjamin Delaware and Steven Keuchel and Tom Schrijvers and Bruno C.d.S. Oliveira", title = "Modular monadic meta-theory", journal = j-SIGPLAN, volume = "48", number = "9", pages = "319--330", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500587", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents 3MT, a framework for modular mechanized meta-theory of languages with effects. Using 3MT, individual language features and their corresponding definitions --- semantic functions, theorem statements and proofs-- can be built separately and then reused to create different languages with fully mechanized meta-theory. 3MT combines modular datatypes and monads to define denotational semantics with effects on a per-feature basis, without fixing the particular set of effects or language constructs. One well-established problem with type soundness proofs for denotational semantics is that they are notoriously brittle with respect to the addition of new effects. The statement of type soundness for a language depends intimately on the effects it uses, making it particularly challenging to achieve modularity. 3MT solves this long-standing problem by splitting these theorems into two separate and reusable parts: a feature theorem that captures the well-typing of denotations produced by the semantic function of an individual feature with respect to only the effects used, and an effect theorem that adapts well-typings of denotations to a fixed superset of effects. The proof of type soundness for a particular language simply combines these theorems for its features and the combination of their effects. To establish both theorems, 3MT uses two key reasoning techniques: modular induction and algebraic laws about effects. Several effectful language features, including references and errors, illustrate the capabilities of 3MT. 
A case study reuses these features to build fully mechanized definitions and proofs for 28 languages, including several versions of mini-ML with effects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Lorenzen:2013:MAT, author = "Florian Lorenzen and Sebastian Erdweg", title = "Modular and automated type-soundness verification for language extensions", journal = j-SIGPLAN, volume = "48", number = "9", pages = "331--342", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500596", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Language extensions introduce high-level programming constructs that protect programmers from low-level details and repetitive tasks. For such an abstraction barrier to be sustainable, it is important that no errors are reported in terms of generated code. A typical strategy is to check the original user code prior to translation into a low-level encoding, applying the assumption that the translation does not introduce new errors. Unfortunately, such assumption is untenable in general, but in particular in the context of extensible programming languages, such as Racket or SugarJ, that allow regular programmers to define language extensions. In this paper, we present a formalism for building and automatically verifying the type-soundness of syntactic language extensions. To build a type-sound language extension with our formalism, a developer declares an extended syntax, type rules for the extended syntax, and translation rules into the (possibly further extended) base language. Our formalism then validates that the user-defined type rules are sufficient to guarantee that the code generated by the translation rules cannot contain any type errors. This effectively ensures that an initial type check prior to translation precludes type errors in generated code. We have implemented a core system in PLT Redex and we have developed a syntactically extensible variant of System F$_\omega$ that we extend with let notation, monadic do blocks, and algebraic data types. Our formalism verifies the soundness of each extension automatically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Keep:2013:NFC, author = "Andrew W. Keep and R. Kent Dybvig", title = "A nanopass framework for commercial compiler development", journal = j-SIGPLAN, volume = "48", number = "9", pages = "343--350", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500618", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Contemporary compilers must typically handle sophisticated high-level source languages, generate efficient code for multiple hardware architectures and operating systems, and support source-level debugging, profiling, and other program development tools. As a result, compilers tend to be among the most complex of software systems. Nanopass frameworks are designed to help manage this complexity.
A nanopass compiler is comprised of many single-task passes with formally defined intermediate languages. The perceived downside of a nanopass compiler is that the extra passes will lead to substantially longer compilation times. To determine whether this is the case, we have created a plug replacement for the commercial Chez Scheme compiler, implemented using an updated nanopass framework, and we have compared the speed of the new compiler and the code it generates against the original compiler for a large set of benchmark programs. This paper describes the updated nanopass framework, the new compiler, and the results of our experiments. The compiler produces faster code than the original, averaging 15-27\% depending on architecture and optimization level, due to a more sophisticated but slower register allocator and improvements to several optimizations. Compilation times average well within a factor of two of the original compiler, despite the slower register allocator and the replacement of five passes of the original 10 with over 50 nanopasses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{St-Amour:2013:ERA, author = "Vincent St-Amour and Neil Toronto", title = "Experience report: applying random testing to a base type environment", journal = j-SIGPLAN, volume = "48", number = "9", pages = "351--356", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500616", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As programmers, programming in typed languages increases our confidence in the correctness of our programs. As type system designers, soundness proofs increase our confidence in the correctness of our type systems. There is more to typed languages than their typing rules, however. To be usable, a typed language needs to provide a well-furnished standard library and to specify types for its exports. As software artifacts, these base type environments can rival typecheckers in complexity. Our experience with the Typed Racket base environment---which accounts for 31\% of the code in the Typed Racket implementation---teaches us that writing type environments can be just as error-prone as writing typecheckers. We report on our experience over the past two years of using random testing to increase our confidence in the correctness of the Typed Racket base environment.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Petersen:2013:ERF, author = "Christian L. Petersen and Matthias Gorges and Dustin Dunsmuir and Mark Ansermino and Guy A. 
Dumont", title = "Experience report: functional programming of {mHealth} applications", journal = j-SIGPLAN, volume = "48", number = "9", pages = "357--362", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500615", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A modular framework for the development of medical applications that promotes deterministic, robust and correct code is presented. The system is based on the portable Gambit Scheme programming language and provides a flexible cross-platform environment for developing graphical applications on mobile devices as well as medical instrumentation interfaces running on embedded platforms. Real world applications of this framework for mobile diagnostics, telemonitoring and automated drug infusions are reported. The source code for the core framework is open source and available at: https://github.com/part-cw/lambdanative.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Delbianco:2013:HSR, author = "Germ{\'a}n Andr{\'e}s Delbianco and Aleksandar Nanevski", title = "{Hoare}-style reasoning with (algebraic) continuations", journal = j-SIGPLAN, volume = "48", number = "9", pages = "363--376", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500593", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Continuations are programming abstractions that allow for manipulating the ``future'' of a computation. Amongst their many applications, they enable implementing unstructured program flow through higher-order control operators such as callcc. In this paper we develop a Hoare-style logic for the verification of programs with higher-order control, in the presence of dynamic state. This is done by designing a dependent type theory with first class callcc and abort operators, where pre- and postconditions of programs are tracked through types. Our operators are algebraic in the sense of Plotkin and Power, and Jaskelioff, to reduce the annotation burden and enable verification by symbolic evaluation. 
We illustrate working with the logic by verifying a number of characteristic examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Turon:2013:URH, author = "Aaron Turon and Derek Dreyer and Lars Birkedal", title = "Unifying refinement and {Hoare}-style reasoning in a logic for higher-order concurrency", journal = j-SIGPLAN, volume = "48", number = "9", pages = "377--390", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500600", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modular programming and modular verification go hand in hand, but most existing logics for concurrency ignore two crucial forms of modularity: *higher-order functions*, which are essential for building reusable components, and *granularity abstraction*, a key technique for hiding the intricacies of fine-grained concurrent data structures from the clients of those data structures. In this paper, we present CaReSL, the first logic to support the use of granularity abstraction for modular verification of higher-order concurrent programs. After motivating the features of CaReSL through a variety of illustrative examples, we demonstrate its effectiveness by using it to tackle a significant case study: the first formal proof of (partial) correctness for Hendler et al.'s ``flat combining'' algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Chlipala:2013:BSP, author = "Adam Chlipala", title = "The {Bedrock} structured programming system: combining generative metaprogramming and {Hoare} logic in an extensible program verifier", journal = j-SIGPLAN, volume = "48", number = "9", pages = "391--402", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500592", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We report on the design and implementation of an extensible programming language and its intrinsic support for formal verification. Our language is targeted at low-level programming of infrastructure like operating systems and runtime systems. It is based on a cross-platform core combining characteristics of assembly languages and compiler intermediate languages. From this foundation, we take literally the saying that C is a ``macro assembly language'': we introduce an expressive notion of certified low-level macros, sufficient to build up the usual features of C and beyond as macros with no special support in the core. Furthermore, our macros have integrated support for strongest postcondition calculation and verification condition generation, so that we can provide a high-productivity formal verification environment within Coq for programs composed from any combination of macros. Our macro interface is expressive enough to support features that low-level programs usually only access through external tools with no formal guarantees, such as declarative parsing or SQL-inspired querying.
The abstraction level of these macros only imposes a compile-time cost, via the execution of functional Coq programs that compute programs in our intermediate language; but the run-time cost is not substantially greater than for more conventional C code. We describe our experiences constructing a full C-like language stack using macros, with some experiments on the verifiability and performance of individual programs running on that stack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Cheney:2013:PTL, author = "James Cheney and Sam Lindley and Philip Wadler", title = "A practical theory of language-integrated query", journal = j-SIGPLAN, volume = "48", number = "9", pages = "403--416", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500586", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Language-integrated query is receiving renewed attention, in part because of its support through Microsoft's LINQ framework. We present a practical theory of language-integrated query based on quotation and normalisation of quoted terms. Our technique supports join queries, abstraction over values and predicates, composition of queries, dynamic generation of queries, and queries with nested intermediate data. Higher-order features prove useful even for constructing first-order queries. We prove a theorem characterising when a host query is guaranteed to generate a single SQL query. We present experimental results confirming our technique works, even in situations where Microsoft's LINQ framework either fails to produce an SQL query or, in one case, produces an avalanche of SQL queries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Garcia:2013:CTB, author = "Ronald Garcia", title = "Calculating threesomes, with blame", journal = j-SIGPLAN, volume = "48", number = "9", pages = "417--428", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500603", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Coercions and threesomes both enable a language to combine static and dynamic types while avoiding cast-based space leaks. Coercion calculi elegantly specify space-efficient cast behavior, even when augmented with blame tracking, but implementing their semantics directly is difficult. Threesomes, on the other hand, have a straightforward recursive implementation, but endowing them with blame tracking is challenging. In this paper, we show that you can use that elegant spec to produce that straightforward implementation: we use the coercion calculus to derive threesomes with blame. 
In particular, we construct novel threesome calculi for blame tracking strategies that detect errors earlier, catch more errors, and reflect an intuitive conception of safe and unsafe casts based on traditional subtyping.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Dunfield:2013:CEB, author = "Joshua Dunfield and Neelakantan R. Krishnaswami", title = "Complete and easy bidirectional typechecking for higher-rank polymorphism", journal = j-SIGPLAN, volume = "48", number = "9", pages = "429--442", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500582", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Bidirectional typechecking, in which terms either synthesize a type or are checked against a known type, has become popular for its scalability (unlike Damas-Milner type inference, bidirectional typing remains decidable even for very expressive type systems), its error reporting, and its relative ease of implementation. Following design principles from proof theory, bidirectional typing can be applied to many type constructs. The principles underlying a bidirectional approach to polymorphism, however, are less obvious. We give a declarative, bidirectional account of higher-rank polymorphism, grounded in proof theory; this calculus enjoys many properties such as eta-reduction and predictability of annotations. We give an algorithm for implementing the declarative system; our algorithm is remarkably simple and well-behaved, despite being both sound and complete.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Johnson:2013:OAA, author = "J. Ian Johnson and Nicholas Labich and Matthew Might and David {Van Horn}", title = "Optimizing abstract abstract machines", journal = j-SIGPLAN, volume = "48", number = "9", pages = "443--454", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500604", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The technique of abstracting abstract machines (AAM) provides a systematic approach for deriving computable approximations of evaluators that are easily proved sound. This article contributes a complementary step-by-step process for subsequently going from a naive analyzer derived under the AAM approach, to an efficient and correct implementation. The end result of the process is a two to three order-of-magnitude improvement over the systematically derived analyzer, making it competitive with hand-optimized implementations that compute fundamentally less precise results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Hritcu:2013:TNQ, author = "Catalin Hritcu and John Hughes and Benjamin C. 
Pierce and Antal Spector-Zabusky and Dimitrios Vytiniotis and Arthur Azevedo de Amorim and Leonidas Lampropoulos", title = "Testing noninterference, quickly", journal = j-SIGPLAN, volume = "48", number = "9", pages = "455--468", month = sep, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544174.2500574", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 27 18:32:10 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Information-flow control mechanisms are difficult to design and labor intensive to prove correct. To reduce the time wasted on proof attempts doomed to fail due to broken definitions, we advocate modern random testing techniques for finding counterexamples during the design process. We show how to use QuickCheck, a property-based random-testing tool, to guide the design of a simple information-flow abstract machine. We find that both sophisticated strategies for generating well-distributed random programs and readily falsifiable formulations of noninterference properties are critically important. We propose several approaches and evaluate their effectiveness on a collection of injected bugs of varying subtlety. We also present an effective technique for shrinking large counterexamples to minimal, easily comprehensible ones. Taken together, our best methods enable us to quickly and automatically generate simple counterexamples for all these bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '13 conference proceedings.", } @Article{Meyerovich:2013:EAP, author = "Leo A. Meyerovich and Ariel S. Rabkin", title = "Empirical analysis of programming language adoption", journal = j-SIGPLAN, volume = "48", number = "10", pages = "1--18", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509515", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Some programming languages become widely popular while others fail to grow beyond their niche or disappear altogether. This paper uses survey methodology to identify the factors that lead to language adoption. We analyze large datasets, including over 200,000 SourceForge projects, 590,000 projects tracked by Ohloh, and multiple surveys of 1,000-13,000 programmers. We report several prominent findings. First, language adoption follows a power law; a small number of languages account for most language use, but the programming market supports many languages with niche user bases. Second, intrinsic features have only secondary importance in adoption. Open source libraries, existing code, and experience strongly influence developers when selecting a language for a project. Language features such as performance, reliability, and simple semantics do not. Third, developers will steadily learn and forget languages. The overall number of languages developers are familiar with is independent of age. Finally, when considering intrinsic aspects of languages, developers prioritize expressivity over correctness.
They perceive static types as primarily helping with the latter, hence partly explaining the popularity of dynamic languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2013:SSE, author = "You Li and Zhendong Su and Linzhang Wang and Xuandong Li", title = "Steering symbolic execution to less traveled paths", journal = j-SIGPLAN, volume = "48", number = "10", pages = "19--32", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509553", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Symbolic execution is a promising testing and analysis methodology. It systematically explores a program's execution space and can generate test cases with high coverage. One significant practical challenge for symbolic execution is how to effectively explore the enormous number of program paths in real-world programs. Various heuristics have been proposed for guiding symbolic execution, but they are generally inefficient and ad-hoc. In this paper, we introduce a novel, unified strategy to guide symbolic execution to less explored parts of a program. Our key idea is to exploit a specific type of path spectra, namely the length-n subpath program spectra, to systematically approximate full path information for guiding path exploration. In particular, we use frequency distributions of explored length-n subpaths to prioritize ``less traveled'' parts of the program to improve test coverage and error detection. We have implemented our general strategy in KLEE, a state-of-the-art symbolic execution engine. Evaluation results on the GNU Coreutils programs show that (1) varying the length n captures program-specific information and exhibits different degrees of effectiveness, and (2) our general approach outperforms traditional strategies in both coverage and error detection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Carbin:2013:VQR, author = "Michael Carbin and Sasa Misailovic and Martin C. Rinard", title = "Verifying quantitative reliability for programs that execute on unreliable hardware", journal = j-SIGPLAN, volume = "48", number = "10", pages = "33--52", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509546", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Emerging high-performance architectures are anticipated to contain unreliable components that may exhibit soft errors, which silently corrupt the results of computations. Full detection and masking of soft errors is challenging, expensive, and, for some applications, unnecessary. For example, approximate computing applications (such as multimedia processing, machine learning, and big data analytics) can often naturally tolerate soft errors. We present Rely, a programming language that enables developers to reason about the quantitative reliability of an application --- namely, the probability that it produces the correct result when executed on unreliable hardware.
Rely allows developers to specify the reliability requirements for each value that a function produces. We present a static quantitative reliability analysis that verifies quantitative requirements on the reliability of an application, enabling a developer to perform sound and verified reliability engineering. The analysis takes a Rely program with a reliability specification and a hardware specification that characterizes the reliability of the underlying hardware components and verifies that the program satisfies its reliability specification when executed on the underlying unreliable hardware platform. We demonstrate the application of quantitative reliability analysis on six computations implemented in Rely.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Huang:2013:ECS, author = "Jipeng Huang and Michael D. Bond", title = "Efficient context sensitivity for dynamic analyses via calling context uptrees and customized memory management", journal = j-SIGPLAN, volume = "48", number = "10", pages = "53--72", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "OOPSLA '13 conference proceedings.", abstract = "State-of-the-art dynamic bug detectors such as data race and memory leak detectors report program locations that are likely causes of bugs. However, programmers need more than static program locations to understand the behavior of increasingly complex and concurrent software. Dynamic calling context provides additional information, but it is expensive to record calling context frequently, e.g., at every read and write. Context-sensitive dynamic analyses can build and maintain a calling context tree (CCT) to track calling context--but in order to reuse existing nodes, CCT-based approaches require an expensive lookup. This paper introduces a new approach for context sensitivity that avoids this expensive lookup. The approach uses a new data structure called the calling context uptree (CCU) that adds low overhead by avoiding the lookup and instead allocating a new node for each context. A key contribution is that the approach can mitigate the costs of allocating many nodes by extending tracing garbage collection (GC): GC collects unused CCU nodes naturally and efficiently, and we extend GC to merge duplicate nodes lazily. We implement our CCU-based approach in a high-performance Java virtual machine and integrate it with a staleness-based memory leak detector and happens-before data race detector, so they can report context-sensitive program locations that cause bugs. 
We show that the CCU-based approach, in concert with an extended GC, provides a compelling alternative to CCT-based approaches for adding context sensitivity to dynamic analyses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ureche:2013:MIS, author = "Vlad Ureche and Cristian Talau and Martin Odersky", title = "Miniboxing: improving the speed to code size tradeoff in parametric polymorphism translations", journal = j-SIGPLAN, volume = "48", number = "10", pages = "73--92", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509537", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Parametric polymorphism enables code reuse and type safety. Underneath the uniform interface exposed to programmers, however, its low level implementation has to cope with inherently non-uniform data: value types of different sizes and semantics (bytes, integers, floating point numbers) and reference types (pointers to heap objects). On the Java Virtual Machine, parametric polymorphism is currently translated to bytecode using two competing approaches: homogeneous and heterogeneous. Homogeneous translation requires boxing, and thus introduces indirect access delays. Heterogeneous translation duplicates and adapts code for each value type individually, producing more bytecode. Therefore bytecode speed and size are at odds with each other. This paper proposes a novel translation that significantly reduces the bytecode size without affecting the execution speed. The key insight is that larger value types (such as integers) can hold smaller ones (such as bytes) thus reducing the duplication necessary in heterogeneous translations. In our implementation, on the Scala compiler, we encode all primitive value types in long integers. The resulting bytecode approaches the performance of monomorphic code, matches the performance of the heterogeneous translation and obtains speedups of up to 22x over the homogeneous translation, all with modest increases in size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shahriyar:2013:TGR, author = "Rifat Shahriyar and Stephen Michael Blackburn and Xi Yang and Kathryn S. McKinley", title = "Taking off the gloves with reference counting {Immix}", journal = j-SIGPLAN, volume = "48", number = "10", pages = "93--110", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509527", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Despite some clear advantages and recent advances, reference counting remains a poor cousin to high-performance tracing garbage collectors. The advantages of reference counting include (a) immediacy of reclamation, (b) incrementality, and (c) local scope of its operations. 
After decades of languishing with hopelessly bad performance, recent work narrowed the gap between reference counting and the fastest tracing collectors to within 10\%. Though a major advance, this gap remains a substantial barrier to adoption in performance-conscious application domains. Our work identifies heap organization as the principal source of the remaining performance gap. We present the design, implementation, and analysis of a new collector, RC Immix, that replaces reference counting's traditional free-list heap organization with the line and block heap structure introduced by the Immix collector. The key innovations of RC Immix are (1) to combine traditional reference counts with per-line live object counts to identify reusable memory and (2) to eliminate fragmentation by integrating copying with reference counting of new objects and with backup tracing cycle collection. In RC Immix, reference counting offers efficient collection and the line and block heap organization delivers excellent mutator locality and efficient allocation. With these advances, RC Immix closes the 10\% performance gap, matching the performance of a highly tuned production generational collector. By removing the performance barrier, this work transforms reference counting into a serious alternative for meeting high performance objectives for garbage collected languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Xu:2013:RTO, author = "Guoqing Xu", title = "{Resurrector}: a tunable object lifetime profiling technique for optimizing real-world programs", journal = j-SIGPLAN, volume = "48", number = "10", pages = "111--130", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509512", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Modern object-oriented applications commonly suffer from severe performance problems that need to be optimized away for increased efficiency and user satisfaction. Many existing optimization techniques (such as object pooling and pretenuring) require precise identification of object lifetimes. However, it is particularly challenging to obtain object lifetimes both precisely and efficiently: precise profiling techniques such as Merlin introduce several hundred times slowdown even for small programs while efficient approximation techniques often sacrifice precision and produce less useful lifetime information. This paper presents a tunable profiling technique, called Resurrector, that explores the middle ground between high precision and high efficiency to find the precision-efficiency sweetspot for various liveness-based optimization techniques. Our evaluation shows that Resurrector is both more precise and more efficient than the GC-based approximation, and it is orders-of-magnitude faster than Merlin. To demonstrate Resurrector's usefulness, we have developed client analyses to find allocation sites that create large data structures with disjoint lifetimes. By inspecting program source code and reusing data structures created from these allocation sites, we have achieved significant performance gains. 
We have also improved the precision of an existing optimization technique using the lifetime information collected by Resurrector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Norris:2013:CCC, author = "Brian Norris and Brian Demsky", title = "{CDSChecker}: checking concurrent data structures written with {C\slash C++} atomics", journal = j-SIGPLAN, volume = "48", number = "10", pages = "131--150", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509514", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Writing low-level concurrent software has traditionally required intimate knowledge of the entire toolchain and often has involved coding in assembly. New language standards have extended C and C++ with support for low-level atomic operations and a weak memory model, enabling developers to write portable and efficient multithreaded code. Developing correct low-level concurrent code is well-known to be especially difficult under a weak memory model, where code behavior can be surprising. Building reliable concurrent software using C/C++ low-level atomic operations will likely require tools that help developers discover unexpected program behaviors. In this paper we present CDSChecker, a tool for exhaustively exploring the behaviors of concurrent code under the C/C++ memory model. We develop several novel techniques for modeling the relaxed behaviors allowed by the memory model and for minimizing the number of execution behaviors that CDSChecker must explore. We have used CDSChecker to exhaustively unit test several concurrent data structure implementations on specific inputs and have discovered errors in both a recently published C11 implementation of a work-stealing queue and a single producer, single consumer queue implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Raychev:2013:ERD, author = "Veselin Raychev and Martin Vechev and Manu Sridharan", title = "Effective race detection for event-driven programs", journal = j-SIGPLAN, volume = "48", number = "10", pages = "151--166", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509538", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Like shared-memory multi-threaded programs, event-driven programs such as client-side web applications are susceptible to data races that are hard to reproduce and debug. Race detection for such programs is hampered by their pervasive use of ad hoc synchronization, which can lead to a prohibitive number of false positives. Race detection also faces a scalability challenge, as a large number of short-running event handlers can quickly overwhelm standard vector-clock-based techniques. This paper presents several novel contributions that address both of these challenges. 
First, we introduce race coverage, a systematic method for exposing ad hoc synchronization and other (potentially harmful) races to the user, significantly reducing false positives. Second, we present an efficient connectivity algorithm for computing race coverage. The algorithm is based on chain decomposition and leverages the structure of event-driven programs to dramatically decrease the overhead of vector clocks. We implemented our techniques in a tool called EventRacer and evaluated it on a number of public web sites. The results indicate substantial performance and precision improvements of our approach over the state-of-the-art. Using EventRacer, we found many harmful races, most of which are beyond the reach of current techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bolz:2013:SSC, author = "Carl Friedrich Bolz and Lukas Diekmann and Laurence Tratt", title = "Storage strategies for collections in dynamically typed languages", journal = j-SIGPLAN, volume = "48", number = "10", pages = "167--182", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509531", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Dynamically typed language implementations often use more memory and execute slower than their statically typed cousins, in part because operations on collections of elements are unoptimised. This paper describes storage strategies, which dynamically optimise collections whose elements are instances of the same primitive type. We implement storage strategies in the PyPy virtual machine, giving a performance increase of 18\% on wide-ranging benchmarks of real Python programs. We show that storage strategies are simple to implement, needing only 1500LoC in PyPy, and have applicability to a wide range of virtual machines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Miller:2013:IPG, author = "Heather Miller and Philipp Haller and Eugene Burmako and Martin Odersky", title = "Instant pickles: generating object-oriented pickler combinators for fast and extensible serialization", journal = j-SIGPLAN, volume = "48", number = "10", pages = "183--202", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509547", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "As more applications migrate to the cloud, and as ``big data'' edges into even more production environments, the performance and simplicity of exchanging data between compute nodes/devices is increasing in importance. An issue central to distributed programming, yet often under-considered, is serialization or pickling, i.e., persisting runtime objects by converting them into a binary or text representation. 
Pickler combinators are a popular approach from functional programming; their composability alleviates some of the tedium of writing pickling code by hand, but they don't translate well to object-oriented programming due to qualities like open class hierarchies and subtyping polymorphism. Furthermore, both functional pickler combinators and popular, Java-based serialization frameworks tend to be tied to a specific pickle format, leaving programmers with no choice of how their data is persisted. In this paper, we present object-oriented pickler combinators and a framework for generating them at compile-time, called scala/pickling, designed to be the default serialization mechanism of the Scala programming language. The static generation of OO picklers enables significant performance improvements, outperforming Java and Kryo in most of our benchmarks. In addition to high performance and the need for little to no boilerplate, our framework is extensible: using the type class pattern, users can provide both (1) custom, easily interchangeable pickle formats and (2) custom picklers, to override the default behavior of the pickling framework. In benchmarks, we compare scala/pickling with other popular industrial frameworks, and present results on time, memory usage, and size when pickling/unpickling a number of data types used in real-world, large-scale distributed applications and frameworks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Salkeld:2013:IDO, author = "Robin Salkeld and Gregor Kiczales", title = "Interacting with dead objects", journal = j-SIGPLAN, volume = "48", number = "10", pages = "203--216", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509543", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Debugging and analyzing a snapshot of a crashed program's memory is far more difficult than working with a live program, because debuggers can no longer execute code to help make sense of the program state. We present an architecture that supports the restricted execution of ordinary code starting from the snapshot, as if the dead objects within it had been restored, but without access to their original external environment. 
We demonstrate the feasibility of this approach via an implementation for Java that does not require a custom virtual machine, show that it performs competitively with live execution, and use it to diagnose an unresolved memory leak in a mature mainstream application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Politz:2013:PFM, author = "Joe Gibbs Politz and Alejandro Martinez and Matthew Milano and Sumner Warren and Daniel Patterson and Junsong Li and Anand Chitipothu and Shriram Krishnamurthi", title = "{Python}: the full monty", journal = j-SIGPLAN, volume = "48", number = "10", pages = "217--232", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509536", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We present a small-step operational semantics for the Python programming language. We present both a core language for Python, suitable for tools and proofs, and a translation process for converting Python source to this core. We have tested the composition of translation and evaluation of the core for conformance with the primary Python implementation, thereby giving confidence in the fidelity of the semantics. We briefly report on the engineering of these components. Finally, we examine subtle aspects of the language, identifying scope as a pervasive concern that even impacts features that might be considered orthogonal.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gerakios:2013:FIS, author = "Prodromos Gerakios and Aggelos Biboudis and Yannis Smaragdakis", title = "Forsaking inheritance: supercharged delegation in {DelphJ}", journal = j-SIGPLAN, volume = "48", number = "10", pages = "233--252", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509535", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We propose DelphJ: a Java-based OO language that eschews inheritance completely, in favor of a combination of class morphing and (deep) delegation. Compared to past delegation approaches, the novel aspect of our design is the ability to emulate the best aspects of inheritance while retaining maximum flexibility: using morphing, a class can select any of the methods of its delegatee and export them (if desired) or transform them (e.g., to add extra arguments or modify type signatures), yet without needing to name these methods explicitly and handle them one-by-one. Compared to past work on morphing, our approach adopts and adapts advanced delegation mechanisms, in order to add late binding capabilities and, thus, provide a full substitute of inheritance. Additionally, we explore complex semantic issues in the interaction of delegation with late binding. 
We present our language design both informally, with numerous examples, and formally in a core calculus.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Smaragdakis:2013:SBP, author = "Yannis Smaragdakis and George Balatsouras and George Kastrinis", title = "Set-based pre-processing for points-to analysis", journal = j-SIGPLAN, volume = "48", number = "10", pages = "253--270", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509524", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We present set-based pre-analysis: a virtually universal optimization technique for flow-insensitive points-to analysis. Points-to analysis computes a static abstraction of how object values flow through a program's variables. Set-based pre-analysis relies on the observation that much of this reasoning can take place at the set level rather than the value level. Computing constraints at the set level results in significant optimization opportunities: we can rewrite the input program into a simplified form with the same essential points-to properties. This rewrite results in removing both local variables and instructions, thus simplifying the subsequent value-based points-to computation. Effectively, set-based pre-analysis puts the program in a normal form optimized for points-to analysis. Compared to other techniques for off-line optimization of points-to analyses in the literature, the new elements of our approach are the ability to eliminate statements, and not just variables, as well as its modularity: set-based pre-analysis can be performed on the input just once, e.g., allowing the pre-optimization of libraries that are subsequently reused many times and for different analyses. In experiments with Java programs, set-based pre-analysis eliminates 30\% of the program's local variables and 30\% or more of computed context-sensitive points-to facts, over a wide set of benchmarks and analyses, resulting in a ~20\% average speedup (max: 110\%, median: 18\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tetali:2013:MSA, author = "Sai Deep Tetali and Mohsen Lesani and Rupak Majumdar and Todd Millstein", title = "{MrCrypt}: static analysis for secure cloud computations", journal = j-SIGPLAN, volume = "48", number = "10", pages = "271--286", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509554", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "In a common use case for cloud computing, clients upload data and computation to servers that are managed by a third-party infrastructure provider. We describe MrCrypt, a system that provides data confidentiality in this setting by executing client computations on encrypted data. 
MrCrypt statically analyzes a program to identify the set of operations on each input data column, in order to select an appropriate homomorphic encryption scheme for that column, and then transforms the program to operate over encrypted data. The encrypted data and transformed program are uploaded to the server and executed as usual, and the result of the computation is decrypted on the client side. We have implemented MrCrypt for Java and illustrate its practicality on three standard benchmark suites for the Hadoop MapReduce framework. We have also formalized the approach and proven several soundness and security guarantees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{DeLozier:2013:ICL, author = "Christian DeLozier and Richard Eisenberg and Santosh Nagarakatte and Peter-Michael Osera and Milo M. K. Martin and Steve Zdancewic", title = "{Ironclad C++}: a library-augmented type-safe subset of {C++}", journal = j-SIGPLAN, volume = "48", number = "10", pages = "287--304", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509550", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "The C++ programming language remains widely used, despite inheriting many unsafe features from C---features that often lead to failures of type or memory safety that manifest as buffer overflows, use-after-free vulnerabilities, or abstraction violations. Malicious attackers can exploit such violations to compromise application and system security. This paper introduces Ironclad C++, an approach to bringing the benefits of type and memory safety to C++. Ironclad C++ is, in essence, a library-augmented, type-safe subset of C++. All Ironclad C++ programs are valid C++ programs that can be compiled using standard, off-the-shelf C++ compilers. However, not all valid C++ programs are valid Ironclad C++ programs: a syntactic source-code validator statically prevents the use of unsafe C++ features. To enforce safety properties that are difficult to check statically, Ironclad C++ applies dynamic checks via templated ``smart pointer'' classes. Using a semi-automatic refactoring tool, we have ported nearly 50K lines of code to Ironclad C++. These benchmarks incur a performance overhead of 12\% on average, compared to the original unsafe C++ code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Richards:2013:FAC, author = "Gregor Richards and Christian Hammer and Francesco Zappa Nardelli and Suresh Jagannathan and Jan Vitek", title = "Flexible access control for {JavaScript}", journal = j-SIGPLAN, volume = "48", number = "10", pages = "305--322", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509542", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Providing security guarantees for systems built out of untrusted components requires the ability to define and enforce access control policies over untrusted code. 
In Web 2.0 applications, JavaScript code from different origins is often combined on a single page, leading to well-known vulnerabilities. We present a security infrastructure which allows users and content providers to specify access control policies over subsets of a JavaScript program by leveraging the concept of delimited histories with revocation. We implement our proposal in WebKit and evaluate it with three policies on 50 widely used websites with no changes to their JavaScript code and report performance overheads and violations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Feldthaus:2013:SAR, author = "Asger Feldthaus and Anders M{\o}ller", title = "Semi-automatic rename refactoring for {JavaScript}", journal = j-SIGPLAN, volume = "48", number = "10", pages = "323--338", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509520", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Modern IDEs support automated refactoring for many programming languages, but support for JavaScript is still primitive. To perform renaming, which is one of the fundamental refactorings, there is often no practical alternative to simple syntactic search-and-replace. Although more sophisticated alternatives have been developed, they are limited by whole-program assumptions and poor scalability. We propose a technique for semi-automatic refactoring for JavaScript, with a focus on renaming. Unlike traditional refactoring algorithms, semi-automatic refactoring works by a combination of static analysis and interaction with the programmer. With this pragmatic approach, we can provide scalable and effective refactoring support for real-world code, including libraries and incomplete applications. Through a series of experiments that estimate how much manual effort our technique demands from the programmer, we show that our approach is a useful improvement compared to search-and-replace tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Raychev:2013:RS, author = "Veselin Raychev and Max Sch{\"a}fer and Manu Sridharan and Martin Vechev", title = "Refactoring with synthesis", journal = j-SIGPLAN, volume = "48", number = "10", pages = "339--354", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509544", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Refactoring has become an integral part of modern software development, with wide support in popular integrated development environments (IDEs). Modern IDEs provide a fixed set of supported refactorings, listed in a refactoring menu. But with IDEs supporting more and more refactorings, it is becoming increasingly difficult for programmers to discover and memorize all their names and meanings. 
Also, since the set of refactorings is hard-coded, if a programmer wants to achieve a slightly different code transformation, she has to either apply a (possibly non-obvious) sequence of several built-in refactorings, or just perform the transformation by hand. We propose a novel approach to refactoring, based on synthesis from examples, which addresses these limitations. With our system, the programmer need not worry how to invoke individual refactorings or the order in which to apply them. Instead, a transformation is achieved via three simple steps: the programmer first indicates the start of a code refactoring phase; then she performs some of the desired code changes manually; and finally, she asks the tool to complete the refactoring. Our system completes the refactoring by first extracting the difference between the starting program and the modified version, and then synthesizing a sequence of refactorings that achieves (at least) the desired changes. To enable scalable synthesis, we introduce local refactorings, which allow for first discovering a refactoring sequence on small program fragments and then extrapolating it to a full refactoring sequence. We implemented our approach as an Eclipse plug-in, with an architecture that is easily extendible with new refactorings. The experimental results are encouraging: with only minimal user input, the synthesizer was able to quickly discover complex refactoring sequences for several challenging realistic examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bois:2013:BGV, author = "Kristof {Du Bois} and Jennifer B. Sartor and Stijn Eyerman and Lieven Eeckhout", title = "Bottle graphs: visualizing scalability bottlenecks in multi-threaded applications", journal = j-SIGPLAN, volume = "48", number = "10", pages = "355--372", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Understanding and analyzing multi-threaded program performance and scalability is far from trivial, which severely complicates parallel software development and optimization. In this paper, we present bottle graphs, a powerful analysis tool that visualizes multi-threaded program performance, in regards to both per-thread parallelism and execution time. Each thread is represented as a box, with its height equal to the share of that thread in the total program execution time, its width equal to its parallelism, and its area equal to its total running time. The boxes of all threads are stacked upon each other, leading to a stack with height equal to the total program execution time. Bottle graphs show exactly how scalable each thread is, and thus guide optimization towards those threads that have a smaller parallel component (narrower), and a larger share of the total execution time (taller), i.e. to the 'neck' of the bottle. Using light-weight OS modules, we calculate bottle graphs for unmodified multi-threaded programs running on real processors with an average overhead of 0.68\%. 
To demonstrate their utility, we do an extensive analysis of 12 Java benchmarks running on top of the Jikes JVM, which introduces many JVM service threads. We not only reveal and explain scalability limitations of several well-known Java benchmarks; we also analyze the reasons why the garbage collector itself does not scale, and in fact performs optimally with two collector threads for all benchmarks, regardless of the number of application threads. Finally, we compare the scalability of Jikes versus the OpenJDK JVM. We demonstrate how useful and intuitive bottle graphs are as a tool to analyze scalability and help optimize multi-threaded applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{DElia:2013:BLP, author = "Daniele Cono D'Elia and Camil Demetrescu", title = "{Ball--Larus} path profiling across multiple loop iterations", journal = j-SIGPLAN, volume = "48", number = "10", pages = "373--390", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509521", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Identifying the hottest paths in the control flow graph of a routine can direct optimizations to portions of the code where most resources are consumed. This powerful methodology, called path profiling, was introduced by Ball and Larus in the mid 90's [4] and has received considerable attention in the last 15 years for its practical relevance. A shortcoming of the Ball-Larus technique was the inability to profile cyclic paths, making it difficult to mine execution patterns that span multiple loop iterations. Previous results, based on rather complex algorithms, have attempted to circumvent this limitation at the price of significant performance losses even for a small number of iterations. In this paper, we present a new approach to multi-iteration path profiling, based on data structures built on top of the original Ball-Larus numbering technique. Our approach allows the profiling of all executed paths obtained as a concatenation of up to k Ball-Larus acyclic paths, where k is a user-defined parameter. We provide examples showing that this method can reveal optimization opportunities that acyclic-path profiling would miss. 
An extensive experimental investigation on a large variety of Java benchmarks on the Jikes RVM shows that our approach can be even faster than Ball-Larus due to fewer operations on smaller hash tables, producing compact representations of cyclic paths even for large values of k.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sharma:2013:DDE, author = "Rahul Sharma and Eric Schkufza and Berkeley Churchill and Alex Aiken", title = "Data-driven equivalence checking", journal = j-SIGPLAN, volume = "48", number = "10", pages = "391--406", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509509", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We present a data driven algorithm for equivalence checking of two loops. The algorithm infers simulation relations using data from test runs. Once a candidate simulation relation has been obtained, off-the-shelf SMT solvers are used to check whether the simulation relation actually holds. The algorithm is sound: insufficient data will cause the proof to fail. We demonstrate a prototype implementation, called DDEC, of our algorithm, which is the first sound equivalence checker for loops written in x86 assembly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kneuss:2013:SMR, author = "Etienne Kneuss and Ivan Kuraj and Viktor Kuncak and Philippe Suter", title = "Synthesis modulo recursive functions", journal = j-SIGPLAN, volume = "48", number = "10", pages = "407--426", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509555", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We describe techniques for synthesis and verification of recursive functional programs over unbounded domains. Our techniques build on top of an algorithm for satisfiability modulo recursive functions, a framework for deductive synthesis, and complete synthesis procedures for algebraic data types. We present new counterexample-guided algorithms for constructing verified programs. We have implemented these algorithms in an integrated environment for interactive verification and synthesis from relational specifications. 
Our system was able to synthesize a number of useful recursive functions that manipulate unbounded numbers and data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Shi:2013:COU, author = "Yao Shi and Bernard Blackham and Gernot Heiser", title = "Code optimizations using formally verified properties", journal = j-SIGPLAN, volume = "48", number = "10", pages = "427--442", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509513", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Formal program verification offers strong assurance of correctness, backed by the strength of mathematical proof. Constructing these proofs requires humans to identify program invariants, and show that they are always maintained. These invariants are then used to prove that the code adheres to its specification. In this paper, we explore the overlap between formal verification and code optimization. We propose two approaches to reuse the invariants derived in formal proofs and integrate them into compilation. The first applies invariants extracted from the proof, while the second leverages the property of program safety (i.e., the absence of bugs). We reuse this information to improve the performance of generated object code. We evaluated these methods on seL4, a real-world formally-verified microkernel, and obtained improvements in average runtime performance (up to 28\%) and in worst-case execution time (up to 25\%). In macro-benchmarks, we found the performance of para-virtualized Linux running on the microkernel improved by 6-16\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dillig:2013:IIG, author = "Isil Dillig and Thomas Dillig and Boyang Li and Ken McMillan", title = "Inductive invariant generation via abductive inference", journal = j-SIGPLAN, volume = "48", number = "10", pages = "443--456", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509511", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "This paper presents a new method for generating inductive loop invariants that are expressible as boolean combinations of linear integer constraints. The key idea underlying our technique is to perform a backtracking search that combines Hoare-style verification condition generation with a logical abduction procedure based on quantifier elimination to speculate candidate invariants. Starting with true, our method iteratively strengthens loop invariants until they are inductive and strong enough to verify the program. A key feature of our technique is that it is lazy: It only infers those invariants that are necessary for verifying program correctness. Furthermore, our technique can infer arbitrary boolean combinations (including disjunctions) of linear invariants. We have implemented the proposed approach in a tool called HOLA. 
Our experiments demonstrate that HOLA can infer interesting invariants that are beyond the reach of existing state-of-the-art invariant generation tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hoppe:2013:DDB, author = "Michael Hoppe and Stefan Hanenberg", title = "Do developers benefit from generic types?: an empirical comparison of generic and raw types in {Java}", journal = j-SIGPLAN, volume = "48", number = "10", pages = "457--474", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509528", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Type systems that permit developers to express themselves more precisely are one of the primary topics in programming language research, as well as in industrial software development. While it seems plausible that an expressive static type system increases developer productivity, there is little empirical evidence for or against this hypothesis. Generic types in Java are an example: as an extension of Java's original type system, some claim that Java 1.5 improves the type system's ``expressiveness.'' Even if this claim is true, there exists little empirical evidence that claimed expressiveness leads to a measurable increase in developer productivity. This paper introduces an experiment where generic types (in comparison to raw types) have been evaluated in three different directions: (1) the documentation impact on undocumented APIs, (2) the time required for fixing type errors, and (3) the extensibility of a generic type hierarchy. The results of the experiment suggest that generic types improve documentation and reduce extensibility --- without revealing a difference in the time required for fixing type errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dimoulas:2013:OC, author = "Christos Dimoulas and Robert Bruce Findler and Matthias Felleisen", title = "Option contracts", journal = j-SIGPLAN, volume = "48", number = "10", pages = "475--494", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509548", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Many languages support behavioral software contracts so that programmers can describe a component's obligations and promises via logical assertions in its interface. The contract system monitors program execution, checks whether the assertions hold, and, if not, blames the guilty component. Pinning down the violator gets the debugging process started in the right direction. Quality contracts impose a serious run-time cost, however, and programmers therefore compromise in many ways. Some turn off contracts for deployment, but then contracts and code quickly get out of sync during maintenance. Others test contracts randomly or probabilistically. In all cases, programmers have to cope with lack of blame information when the program eventually fails. 
In response, we propose option contracts as an addition to the contract tool box. Our key insight is that in ordinary contract systems, server components impose their contract on client components, giving them no choice whether to trust the server's promises or check them. With option contracts, server components may choose to tag a contract as an option and clients may choose to exercise the option or accept it, in which case they also shoulder some responsibility. We show that option contracts permit programmers to specify flexible checking policies, that their cost is reasonable, and that they satisfy a complete monitoring theorem.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Treichler:2013:LSD, author = "Sean Treichler and Michael Bauer and Alex Aiken", title = "Language support for dynamic, hierarchical data partitioning", journal = j-SIGPLAN, volume = "48", number = "10", pages = "495--514", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509545", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Applications written for distributed-memory parallel architectures must partition their data to enable parallel execution. As memory hierarchies become deeper, it is increasingly necessary that the data partitioning also be hierarchical to match. Current language proposals perform this hierarchical partitioning statically, which excludes many important applications where the appropriate partitioning is itself data dependent and so must be computed dynamically. We describe Legion, a region-based programming system, where each region may be partitioned into subregions. Partitions are computed dynamically and are fully programmable. The division of data need not be disjoint and subregions of a region may overlap, or alias one another. Computations use regions with certain privileges (e.g., expressing that a computation uses a region read-only) and data coherence (e.g., expressing that the computation need only be atomic with respect to other operations on the region), which can be controlled on a per-region (or subregion) basis. We present the novel aspects of the Legion design, in particular the combination of static and dynamic checks used to enforce soundness. We give an extended example illustrating how Legion can express computations with dynamically determined relationships between computations and data partitions. We prove the soundness of Legion's type system, and show Legion type checking improves performance by up to 71\% by eliding provably safe memory checks. In particular, we show that the dynamic checks to detect aliasing at runtime at the region granularity have negligible overhead. 
We report results for three real-world applications running on distributed memory machines, achieving up to 62.5X speedup on 96 GPUs on the Keeneland supercomputer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Balatsouras:2013:CHC, author = "George Balatsouras and Yannis Smaragdakis", title = "Class hierarchy complementation: soundly completing a partial type graph", journal = j-SIGPLAN, volume = "48", number = "10", pages = "515--532", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509530", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We present the problem of class hierarchy complementation: given a partially known hierarchy of classes together with subtyping constraints (``A has to be a transitive subtype of B'') complete the hierarchy so that it satisfies all constraints. The problem has immediate practical application to the analysis of partial programs--e.g., it arises in the process of providing a sound handling of ``phantom classes'' in the Soot program analysis framework. We provide algorithms to solve the hierarchy complementation problem in the single inheritance and multiple inheritance settings. We also show that the problem in a language such as Java, with single inheritance but multiple subtyping and distinguished class vs. interface types, can be decomposed into separate single- and multiple-subtyping instances. We implement our algorithms in a tool, JPhantom, which complements partial Java bytecode programs so that the result is guaranteed to satisfy the Java verifier requirements. JPhantom is highly scalable and runs in mere seconds even for large input applications and complex constraints (with a maximum of 14s for a 19MB binary).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ravichandran:2013:MES, author = "Kaushik Ravichandran and Santosh Pande", title = "Multiverse: efficiently supporting distributed high-level speculation", journal = j-SIGPLAN, volume = "48", number = "10", pages = "533--552", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509525", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Algorithmic speculation or high-level speculation is a promising programming paradigm which allows programmers to speculatively branch an execution into multiple independent parallel sections and then choose the best (perhaps fastest) amongst them. The continuing execution after the speculatively branched section sees only the modifications made by the best one. This programming paradigm allows programmers to harness parallelism and can provide dramatic performance improvements. In this paper we present the Multiverse speculative programming model. Multiverse allows programmers to exploit parallelism through high-level speculation. 
It can effectively harness large amounts of parallelism by speculating across an entire cluster and is not bound by the parallelism available in a single machine. We present abstractions and a runtime which allow programmers to introduce large scale high-level speculative parallelism into applications with minimal effort. We introduce a novel on-demand address space sharing mechanism which provides speculations efficient, transparent access to the original address space of the application (including the use of pointers) across machine boundaries. Multiverse provides single commit semantics across speculations while guaranteeing isolation between them. We also introduce novel mechanisms to deal with scalability bottlenecks when there are a large number of speculations. We demonstrate that for several benchmarks, Multiverse achieves impressive speedups and good scalability across entire clusters. We study the overheads of the runtime and demonstrate how our special scalability mechanisms are crucial in scaling cluster-wide.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Clebsch:2013:FCG, author = "Sylvan Clebsch and Sophia Drossopoulou", title = "Fully concurrent garbage collection of actors on many-core machines", journal = j-SIGPLAN, volume = "48", number = "10", pages = "553--570", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509557", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Disposal of dead actors in actor-model languages is as important as disposal of unreachable objects in object-oriented languages. In current practice, programmers are required to either manually terminate actors, or they have to rely on garbage collection systems that monitor actor mutation through write barriers, thread coordination through locks, etc. These techniques, however, prevent the collector from being fully concurrent. We developed a protocol that allows garbage collection to run fully concurrently with all actors. The main challenge in concurrent garbage collection is the detection of cycles of sleeping actors in the actor graph, in the presence of concurrent mutation of this graph. Our protocol is solely built on message passing: it uses deferred direct reference counting, a dedicated actor for the detection of (cyclic) garbage, and a confirmation protocol (to deal with the mutation of the actor graph). We present our ideas informally through an example, and then present a formal model, prove soundness and argue completeness. We have implemented the protocol as part of a runtime library. As a preliminary performance evaluation, we discuss the performance of our approach as currently used at a financial institution, and use four benchmarks from the literature to compare our approach with other actor-model systems.
These preliminary results indicate that the overhead of our approach is small.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhao:2013:INT, author = "Jisheng Zhao and Roberto Lublinerman and Zoran Budimli{\'c} and Swarat Chaudhuri and Vivek Sarkar", title = "Isolation for nested task parallelism", journal = j-SIGPLAN, volume = "48", number = "10", pages = "571--588", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Isolation--the property that a task can access shared data without interference from other tasks--is one of the most basic concerns in parallel programming. While there is a large body of past work on isolated task-parallelism, the integration of isolation, task-parallelism, and nesting of tasks has been a difficult and unresolved challenge. In this paper, we present a programming and execution model called Otello where isolation is extended to arbitrarily nested parallel tasks with irregular accesses to heap data. At the same time, no additional burden is imposed on the programmer, who only exposes parallelism by creating and synchronizing parallel tasks, leaving the job of ensuring isolation to the underlying compiler and runtime system. Otello extends our past work on the Aida execution model and the delegated isolation mechanism [22] to the setting of nested parallelism. The basic runtime construct in Aida and Otello is an assembly: a task equipped with a region in the shared heap that it owns. When an assembly A conflicts with an assembly B, A transfers--or delegates--its code and owned region to a carefully selected assembly C in a way that will ensure isolation with B, leaving the responsibility of re-executing task A to C. The choice of C depends on the nesting relationship between A and B. We have implemented Otello on top of the Habanero Java (HJ) parallel programming language [8], and used this implementation to evaluate Otello on collections of nested task-parallel benchmarks and non-nested transactional benchmarks from past work. On the nested task-parallel benchmarks, Otello achieves scalability comparable to HJ programs without built-in isolation, and the relative overhead of Otello is lower than that of many published data-race detection algorithms that detect the isolation violations (but do not enforce isolation).
For the transactional benchmarks, Otello incurs lower overhead than a state-of-the-art software transactional memory system (Deuce STM).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Tripp:2013:TNP, author = "Omer Tripp and Eric Koskinen and Mooly Sagiv", title = "Turning nondeterminism into parallelism", journal = j-SIGPLAN, volume = "48", number = "10", pages = "589--604", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509533", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Nondeterminism is a useful and prevalent concept in the design and implementation of software systems. An important property of nondeterminism is its latent parallelism: A nondeterministic action can evaluate to multiple behaviors. If at least one of these behaviors does not conflict with concurrent tasks, then there is an admissible execution of the action in parallel with these tasks. Unfortunately, existing implementations of the atomic paradigm --- optimistic as well as pessimistic --- are unable to fully exhaust the parallelism potential of nondeterministic actions, lacking the means to guide concurrent tasks toward nondeterministic choices that minimize interference. This paper investigates the problem of utilizing parallelism due to nondeterminism. We observe that nondeterminism occurs in many real-world codes. We motivate the need for devising coordination mechanisms that can utilize available nondeterminism. We have developed a system featuring such mechanisms, which leverages nondeterminism in a wide class of query operations, allowing a task to look into the future of concurrent tasks that mutate the shared state during query evaluation and reduce conflict accordingly. We evaluate our system on a suite of 12 algorithmic benchmarks of wide applicability, as well as an industrial application. The results are encouraging.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chong:2013:BIS, author = "Nathan Chong and Alastair F. Donaldson and Paul H. J. Kelly and Jeroen Ketema and Shaz Qadeer", title = "Barrier invariants: a shared state abstraction for the analysis of data-dependent {GPU} kernels", journal = j-SIGPLAN, volume = "48", number = "10", pages = "605--622", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509517", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Data-dependent GPU kernels, whose data or control flow are dependent on the input of the program, are difficult to verify because they require reasoning about shared state manipulated by many parallel threads. Existing verification techniques for GPU kernels achieve soundness and scalability by using a two-thread reduction and making the contents of the shared state nondeterministic each time threads synchronise at a barrier, to account for all possible thread interactions. This coarse abstraction prohibits verification of data-dependent kernels. 
We present barrier invariants, a novel abstraction technique which allows key properties about the shared state of a kernel to be preserved across barriers during formal reasoning. We have integrated barrier invariants with the GPUVerify tool, and present a detailed case study showing how they can be used to verify three prefix sum algorithms, allowing efficient modular verification of a stream compaction kernel, a key building block for GPU programming. This analysis goes significantly beyond what is possible using existing verification techniques for GPU kernels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Choi:2013:GGT, author = "Wontae Choi and George Necula and Koushik Sen", title = "Guided {GUI} testing of {Android} apps with minimal restart and approximate learning", journal = j-SIGPLAN, volume = "48", number = "10", pages = "623--640", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509552", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Smartphones and tablets with rich graphical user interfaces (GUI) are becoming increasingly popular. Hundreds of thousands of specialized applications, called apps, are available for such mobile platforms. Manual testing is the most popular technique for testing graphical user interfaces of such apps. Manual testing is often tedious and error-prone. In this paper, we propose an automated technique, called Swift-Hand, for generating sequences of test inputs for Android apps. The technique uses machine learning to learn a model of the app during testing, uses the learned model to generate user inputs that visit unexplored states of the app, and uses the execution of the app on the generated inputs to refine the model. A key feature of the testing algorithm is that it avoids restarting the app, which is a significantly more expensive operation than executing the app on a sequence of inputs. An important insight behind our testing algorithm is that we do not need to learn a precise model of an app, which is often computationally intensive, if our goal is to simply guide test execution into unexplored parts of the state space. We have implemented our testing algorithm in a publicly available tool for Android apps written in Java. Our experimental results show that we can achieve significantly better coverage than traditional random testing and L*-based testing in a given time budget. 
Our algorithm also reaches peak coverage faster than both random and L*-based testing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Azim:2013:TDF, author = "Tanzirul Azim and Iulian Neamtiu", title = "Targeted and depth-first exploration for systematic testing of {Android} apps", journal = j-SIGPLAN, volume = "48", number = "10", pages = "641--660", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509549", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Systematic exploration of Android apps is an enabler for a variety of app analysis and testing tasks. Performing the exploration while apps run on actual phones is essential for exploring the full range of app capabilities. However, exploring real-world apps on real phones is challenging due to non-determinism, non-standard control flow, scalability and overhead constraints. Relying on end-users to conduct the exploration might not be very effective: we performed a 7-user study on popular Android apps, and found that the combined 7-user coverage was 30.08\% of the app screens and 6.46\% of the app methods. Prior approaches for automated exploration of Android apps have run apps in an emulator or focused on small apps whose source code was available. To address these problems, we present A$^3$E, an approach and tool that allows substantial Android apps to be explored systematically while running on actual phones, yet without requiring access to the app's source code. The key insight of our approach is to use a static, taint-style, dataflow analysis on the app bytecode in a novel way, to construct a high-level control flow graph that captures legal transitions among activities (app screens). We then use this graph to develop an exploration strategy named Targeted Exploration that permits fast, direct exploration of activities, including activities that would be difficult to reach during normal use. We also developed a strategy named Depth-first Exploration that mimics user actions for exploring activities and their constituents in a slower, but more systematic way. To measure the effectiveness of our techniques, we use two metrics: activity coverage (number of screens explored) and method coverage. Experiments using our approach on 25 popular Android apps including BBC News, Gas Buddy, Amazon Mobile, YouTube, Shazam Encore, and CNN, show that our exploration techniques achieve 59.39--64.11\% activity coverage and 29.53--36.46\% method coverage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kansal:2013:LAB, author = "Aman Kansal and Scott Saponas and A. J. Bernheim Brush and Kathryn S.
McKinley and Todd Mytkowicz and Ryder Ziola", title = "The latency, accuracy, and battery {(LAB)} abstraction: programmer productivity and energy efficiency for continuous mobile context sensing", journal = j-SIGPLAN, volume = "48", number = "10", pages = "661--676", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509541", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Emerging mobile applications that sense context are poised to delight and entertain us with timely news and events, health tracking, and social connections. Unfortunately, sensing algorithms quickly drain the phone's battery. Developers can overcome battery drain by carefully optimizing context sensing but that makes programming with context arduous and ties applications to current sensing hardware. These types of applications embody a twist on the classic tension between programmer productivity and performance due to their combination of requirements. This paper identifies the latency, accuracy, battery (LAB) abstraction to resolve this tension. We implement and evaluate LAB in a system called Senergy. Developers specify their LAB requirements independent of inference algorithms and sensors. Senergy delivers energy efficient context while meeting the requirements and adapts as hardware changes. We demonstrate LAB's expressiveness by using it to implement 22 context sensing algorithms for four types of context (location, driving, walking, and stationary) and six diverse applications. To demonstrate LAB's energy optimizations, we show improvements in energy efficiency on applications, often by an order of magnitude, compared to prior approaches. This relatively simple, priority-based API may serve as a blueprint for future API design in an increasingly complex design space that must trade off latency, accuracy, and efficiency to meet application needs and attain portability across evolving, sensor-rich, heterogeneous, and power-constrained hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bergan:2013:ICS, author = "Tom Bergan and Luis Ceze and Dan Grossman", title = "Input-covering schedules for multithreaded programs", journal = j-SIGPLAN, volume = "48", number = "10", pages = "677--692", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509508", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We propose constraining multithreaded execution to small sets of input-covering schedules, which we define as follows: given a program $P$, we say that a set of schedules $ \Sigma $ covers all inputs of program $P$ if, when given any input, $P$'s execution can be constrained to some schedule in $ \Sigma $ and still produce a semantically valid result. Our approach is to first compute a small $ \Sigma $ for a given program $P$, and then, at runtime, constrain $P$'s execution to always follow some schedule in $ \Sigma $, and never deviate.
We have designed an algorithm that uses symbolic execution to systematically enumerate a set of input-covering schedules, $ \Sigma $. To deal with programs that run for an unbounded length of time, we partition execution into bounded epochs, find input-covering schedules for each epoch in isolation, and then piece the schedules together at runtime. We have implemented this algorithm along with a constrained execution runtime for pthreads programs, and we report results. Our approach has the following advantage: because all possible runtime schedules are known a priori, we can seek to validate the program by thoroughly verifying each schedule in $ \Sigma $, in isolation, without needing to reason about the huge space of thread interleavings that arises due to conventional nondeterministic execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bond:2013:OCC, author = "Michael D. Bond and Milind Kulkarni and Man Cao and Minjia Zhang and Meisam Fathi Salmi and Swarnendu Biswas and Aritra Sengupta and Jipeng Huang", title = "{OCTET}: capturing and controlling cross-thread dependences efficiently", journal = j-SIGPLAN, volume = "48", number = "10", pages = "693--712", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509519", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Parallel programming is essential for reaping the benefits of parallel hardware, but it is notoriously difficult to develop and debug reliable, scalable software systems. One key challenge is that modern languages and systems provide poor support for ensuring concurrency correctness properties --- atomicity, sequential consistency, and multithreaded determinism --- because all existing approaches are impractical. Dynamic, software-based approaches slow programs by up to an order of magnitude because capturing and controlling cross-thread dependences (i.e., conflicting accesses to shared memory) requires synchronization at virtually every access to potentially shared memory. This paper introduces a new software-based concurrency control mechanism called OCTET that soundly captures cross-thread dependences and can be used to build dynamic analyses for concurrency correctness. OCTET achieves low overheads by tracking the locality state of each potentially shared object. Non-conflicting accesses conform to the locality state and require no synchronization; only conflicting accesses require a state change and heavyweight synchronization. This optimistic tradeoff leads to significant efficiency gains in capturing cross-thread dependences: a prototype implementation of OCTET in a high-performance Java virtual machine slows real-world concurrent programs by only 26\% on average. A dependence recorder, suitable for record {\&} replay, built on top of OCTET adds an additional 5\% overhead on average.
These results suggest that OCTET can provide a foundation for developing low-overhead analyses that check and enforce concurrency correctness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Noll:2013:OFD, author = "Albert Noll and Thomas Gross", title = "Online feedback-directed optimizations for parallel {Java} code", journal = j-SIGPLAN, volume = "48", number = "10", pages = "713--728", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "OOPSLA '13 conference proceedings.", abstract = "The performance of parallel code significantly depends on the parallel task granularity (PTG). If the PTG is too coarse, performance suffers due to load imbalance; if the PTG is too fine, performance suffers from the overhead that is induced by parallel task creation and scheduling. This paper presents a software platform that automatically determines the PTG at run-time. Automatic PTG selection is enabled by concurrent calls, which are special source language constructs that provide a late decision (at run-time) of whether concurrent calls are executed sequentially or concurrently (as a parallel task). Furthermore, the execution semantics of concurrent calls permits the runtime system to merge two (or more) concurrent calls thereby coarsening the PTG. We present an integration of concurrent calls into the Java programming language, the Java Memory Model, and show how the Java Virtual Machine can adapt the PTG based on dynamic profiling. The performance evaluation shows that our runtime system performs competitively to Java programs for which the PTG is tuned manually. Compared to an unfortunate choice of the PTG, this approach performs up to 3x faster than standard Java code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Herhut:2013:RTP, author = "Stephan Herhut and Richard L. Hudson and Tatiana Shpeisman and Jaswanth Sreeram", title = "{River Trail}: a path to parallelism in {JavaScript}", journal = j-SIGPLAN, volume = "48", number = "10", pages = "729--744", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509516", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "JavaScript is the most popular language on the web and is a crucial component of HTML5 applications and services that run on consumer platforms ranging from desktops to phones. However, despite ample amount of hardware parallelism available to web applications on such platforms, JavaScript web applications remain predominantly sequential. Common parallel programming solutions accepted by other programming languages failed to transfer themselves to JavaScript due to differences in programming models, the additional requirements of the web and different developer expectations. 
In this paper we present River Trail --- a parallel programming model and API for JavaScript that provides safe, portable, programmer-friendly, deterministic parallelism to JavaScript applications. River Trail allows web applications to effectively utilize multiple cores, vector instructions, and GPUs on client platforms while allowing the web developer to remain within the environment of JavaScript. We describe the implementation of the River Trail compiler and runtime and present experimental results that show the impact of River Trail on performance and scalability for a variety of realistic HTML5 applications. Our experiments show that River Trail has a dramatic positive impact on overall performance and responsiveness of computationally intense JavaScript based applications achieving up to 33.6 times speedup for kernels and up to 11.8 times speedup for realistic web applications compared to sequential JavaScript. Moreover, River Trail enables new interactive web usages that are simply not even possible with standard sequential JavaScript.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bhattacharya:2013:CCI, author = "Suparna Bhattacharya and Kanchi Gopinath and Mangala Gowri Nanda", title = "Combining concern input with program analysis for bloat detection", journal = j-SIGPLAN, volume = "48", number = "10", pages = "745--764", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Framework based software tends to get bloated by accumulating optional features (or concerns ) just-in-case they are needed. The good news is that such feature bloat need not always cause runtime execution bloat. The bad news is that often enough, only a few statements from an optional concern may cause execution bloat that may result in as much as 50\% runtime overhead. We present a novel technique to analyze the connection between optional concerns and the potential sources of execution bloat induced by them. Our analysis automatically answers questions such as (1) whether a given set of optional concerns could lead to execution bloat and (2) which particular statements are the likely sources of bloat when those concerns are not required. The technique combines coarse grain concern input from an external source with a fine-grained static analysis. 
Our experimental evaluation highlights the effectiveness of such concern augmented program analysis in execution bloat assessment of ten programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhang:2013:IMF, author = "Lingming Zhang and Lu Zhang and Sarfraz Khurshid", title = "Injecting mechanical faults to localize developer faults for evolving software", journal = j-SIGPLAN, volume = "48", number = "10", pages = "765--784", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509551", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "This paper presents a novel methodology for localizing faults in code as it evolves. Our insight is that the essence of failure-inducing edits made by the developer can be captured using mechanical program transformations (e.g., mutation changes). Based on the insight, we present the FIFL framework, which uses both the spectrum information of edits (obtained using the existing FaultTracer approach) and the potential impacts of edits (simulated by mutation changes) to achieve more accurate fault localization. We evaluate FIFL on real-world repositories of nine Java projects ranging from 5.7KLoC to 88.8KLoC. The experimental results show that FIFL is able to outperform the state-of-the-art FaultTracer technique for localizing failure-inducing program edits significantly. For example, all 19 FIFL strategies that use both the spectrum information and simulated impact information for each edit outperform the existing FaultTracer approach statistically at the significance level of 0.01. In addition, FIFL with its default settings outperforms FaultTracer by 2.33\% to 86.26\% on 16 of the 26 studied version pairs, and is only inferior to FaultTracer on one version pair.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Deng:2013:ECB, author = "Dongdong Deng and Wei Zhang and Shan Lu", title = "Efficient concurrency-bug detection across inputs", journal = j-SIGPLAN, volume = "48", number = "10", pages = "785--802", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509539", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "In the multi-core era, it is critical to efficiently test multi-threaded software and expose concurrency bugs before software release. Previous work has made significant progress in detecting and validating concurrency bugs under a given input. Unfortunately, software testing always faces large sets of test inputs, and existing techniques are still too expensive to be applied to every test input in practice. In this paper, we use open-source software to study how existing concurrency-bug detection tools work for a set of inputs. The study shows that an interleaving pattern, such as a data race or an atomicity violation, can often be exposed by many inputs.
Consequently, existing bug detectors would inevitably waste their bug detection effort to generate duplicate bug reports, when applied to a set of inputs. Guided by the above study, we propose a coverage metric, Concurrent Function Pairs (CFP), to efficiently approximate how interleavings overlap across inputs. Using CFP, we have designed a new approach to detecting data races and atomicity-violation bugs for a set of inputs. Our evaluation on open-source C/C++ applications shows that our CFP-guided approach can effectively accelerate concurrency-bug detection for a set of inputs by reducing redundant detection effort across inputs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhong:2013:DAD, author = "Hao Zhong and Zhendong Su", title = "Detecting {API} documentation errors", journal = j-SIGPLAN, volume = "48", number = "10", pages = "803--816", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509523", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "When programmers encounter an unfamiliar API library, they often need to refer to its documentations, tutorials, or discussions on development forums to learn its proper usage. These API documents contain valuable information, but may also mislead programmers as they may contain errors ( e.g., broken code names and obsolete code samples). Although most API documents are actively maintained and updated, studies show that many new and latent errors do exist. It is tedious and error-prone to find such errors manually as API documents can be enormous with thousands of pages. Existing tools are ineffective in locating documentation errors because traditional natural language (NL) tools do not understand code names and code samples, and traditional code analysis tools do not understand NL sentences. In this paper, we propose the first approach, DOCREF, specifically designed and developed to detect API documentation errors. We formulate a class of inconsistencies to indicate potential documentation errors, and combine NL and code analysis techniques to detect and report such inconsistencies. We have implemented DOCREF and evaluated its effectiveness on the latest documentations of five widely-used API libraries. DOCREF has detected more than 1,000 new documentation errors, which we have reported to the authors. 
Many of the errors have already been confirmed and fixed after we reported them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bao:2013:FDI, author = "Tao Bao and Xiangyu Zhang", title = "On-the-fly detection of instability problems in floating-point program execution", journal = j-SIGPLAN, volume = "48", number = "10", pages = "817--832", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509526", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "The machine representation of floating point values has limited precision such that errors may be introduced during execution. These errors may get propagated and magnified by the following operations, leading to instability problems, e.g., control flow path may be undesirably altered and faulty output may be emitted. In this paper, we develop an on-the-fly efficient monitoring technique that can predict if an execution is stable. The technique does not explicitly compute errors as doing so incurs high overhead. Instead, it detects possible places where an error becomes substantially inflated regarding the corresponding value, and then tags the value with one bit to denote that it has an inflated error. It then tracks inflation bit propagation, taking care of operations that may cut off such propagation. It reports instability if any inflation bit reaches a critical execution point, such as a predicate, where the inflated error may induce substantial execution difference, such as different execution paths. Our experiment shows that with appropriate thresholds, the technique can correctly detect that over 99.999996\% of the inputs of all the programs we studied are stable while a traditional technique relying solely on inflation detection mistakenly classifies the majority of the inputs as unstable for some of the programs. Compared to the state-of-the-art technique that is based on high-precision computation and causes several hundred times slowdown, our technique only causes 7.91 times slowdown on average and can report all the true unstable executions with the appropriate thresholds.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Coons:2013:BPO, author = "Katherine E. Coons and Madan Musuvathi and Kathryn S. McKinley", title = "Bounded partial-order reduction", journal = j-SIGPLAN, volume = "48", number = "10", pages = "833--848", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509556", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Eliminating concurrency errors is increasingly important as systems rely more on parallelism for performance. Exhaustively exploring the state-space of a program's thread interleavings finds concurrency errors and provides coverage guarantees, but suffers from exponential state-space explosion. Two prior approaches alleviate state-space explosion.
(1) Dynamic partial-order reduction (DPOR) provides full coverage and explores only one interleaving of independent transitions. (2) Bounded search provides bounded coverage by enumerating interleavings that do not exceed a bound. In particular, we focus on preemption-bounding. Combining partial-order reduction with preemption-bounding had remained an open problem. We show that preemption-bounded search explores the same partial orders repeatedly and consequently explores more executions than unbounded DPOR, even for small bounds. We further show that if DPOR simply uses the preemption bound to prune the state space as it explores new partial orders, it misses parts of the state space reachable in the bound and is therefore unsound. The bound essentially induces dependences between otherwise independent transitions in the DPOR state space. We introduce Bounded Partial Order Reduction (BPOR), a modification of DPOR that compensates for bound dependences. We identify properties that determine how well bounds combine with partial-order reduction. We prove sound coverage and empirically evaluate BPOR with preemption and fairness bounds. We show that by eliminating redundancies, BPOR significantly reduces testing time compared to bounded search. BPOR's faster incremental guarantees will help testers verify larger concurrent programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Mitchell:2013:FCP, author = "Nick Mitchell and Peter F. Sweeney", title = "On-the-fly capacity planning", journal = j-SIGPLAN, volume = "48", number = "10", pages = "849--866", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509540", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "When resolving performance problems, a simple histogram of hot call stacks does not cut it, especially given the highly fluid nature of modern deployments. Why bother tuning, when adding a few CPUs via the management console will quickly resolve the problem? The findings of these tools are also presented without any sense of context: e.g. string conversion may be expensive, but only matters if it contributes greatly to the response time of user logins. Historically, these concerns have been the purview of capacity planning. The power of planners lies in their ability to weigh demand versus capacity, and to do so in terms of the important units of work in the application (such as user logins). Unfortunately, they rely on measurements of rates and latencies, and both quantities are difficult to obtain. Even if possible, when all is said and done, these planners only relate to the code as a black-box: but, why bother adding CPUs, when easy code changes will fix the problem? We present a way to do planning on-the-fly: with a few call stack samples taken from an already-running system, we predict the benefit of a proposed tuning plan. We accomplish this by simulating the effect of a tuning action upon execution speed and the way it shifts resource demand. To identify existing problems, we show how to generate tuning actions automatically, guided by the desire to maximize speedup without needless expense, and that these generated plans may span resource and code changes. 
We show that it is possible to infer everything needed from these samples alone: levels of resource demand and the units of work in the application. We evaluate our planner on a suite of microbenchmarks and a suite of 15,000 data sets that come from real applications running in the wild.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Vafeiadis:2013:RSL, author = "Viktor Vafeiadis and Chinmay Narayan", title = "Relaxed separation logic: a program logic for {C11} concurrency", journal = j-SIGPLAN, volume = "48", number = "10", pages = "867--884", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509532", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We introduce relaxed separation logic (RSL), the first program logic for reasoning about concurrent programs running under the C11 relaxed memory model. From a user's perspective, RSL is an extension of concurrent separation logic (CSL) with proof rules for the various kinds of C11 atomic accesses. As in CSL, individual threads are allowed to access non-atomically only the memory that they own, thus preventing data races. Ownership can, however, be transferred via certain atomic accesses. For SC-atomic accesses, we permit arbitrary ownership transfer; for acquire/release atomic accesses, we allow ownership transfer only in one direction; whereas for relaxed atomic accesses, we rule out ownership transfer completely. We illustrate RSL with a few simple examples and prove its soundness directly over the axiomatic C11 weak memory model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Petrank:2013:SFA, author = "Erez Petrank", title = "Safety-first approach to memory consistency models", journal = j-SIGPLAN, volume = "48", number = "11", pages = "1--2", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2466479", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Reames:2013:THC, author = "Philip Reames and George Necula", title = "Towards hinted collection: annotations for decreasing garbage collector pause times", journal = j-SIGPLAN, volume = "48", number = "11", pages = "3--14", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2464158", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "Garbage collection is widely used and has largely been a boon for programmer productivity. However, traditional garbage collection is approaching both practical and theoretical performance limits. 
In practice, the maximum heap size and heap structure of large applications are influenced as much by garbage collector behavior as by resource availability. We present an alternate approach to garbage collection wherein the programmer provides untrusted deallocation hints. Usage of deallocation hints is similar to trusted manual deallocation, but the consequence of an inaccurate hint is lost performance not correctness. Our hinted collector algorithm uses these hints to identify a subset of unreachable objects with both better parallel asymptotic complexity and practical performance. On some benchmarks, our prototype collector implementation achieves 10-20\% pause time reductions. We close with a discussion of the design trade-offs inherent in our approach and lessons to be learned from our collector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Morikawa:2013:ASR, author = "Kazuya Morikawa and Tomoharu Ugawa and Hideya Iwasaki", title = "Adaptive scanning reduces sweep time for the {Lisp2} mark-compact garbage collector", journal = j-SIGPLAN, volume = "48", number = "11", pages = "15--26", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2466480", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "Mark-compact garbage collection helps long-running programs avoid fragmentation. The Lisp2 mark-compact collector is a classic but still widely-used compaction algorithm. It sequentially scans the entire heap to compact all live objects at one end of the heap while preserving their order of addresses. Since the heap is generally large, this scanning takes a long time. Although some collectors adopt a separate bitmap into which mark bits of objects are stored to reduce the scanning time, we observed that scanning the bitmap can take longer than scanning the heap if objects are densely located. We propose a new scanning method from this observation, which adaptively alternates methods of scanning depending on heap usage; it scans those parts of the heap where live objects are densely located whereas it scans the bitmap for the remaining parts. We implemented this scanning method in the Lisp2 collector of Jikes RVM. The experimental results revealed that the adaptive scanner scanned faster than the method that only scanned the heap and the method that only scanned the bitmap.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{White:2013:CTP, author = "David R. White and Jeremy Singer and Jonathan M. Aitken and Richard E. Jones", title = "Control theory for principled heap sizing", journal = j-SIGPLAN, volume = "48", number = "11", pages = "27--38", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2466481", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "ISMM '13 conference proceedings.", abstract = "We propose a new, principled approach to adaptive heap sizing based on control theory. 
We review current state-of-the-art heap sizing mechanisms, as deployed in Jikes RVM and HotSpot. We then formulate heap sizing as a control problem, apply and tune a standard controller algorithm, and evaluate its performance on a set of well-known benchmarks. We find our controller adapts the heap size more responsively than existing mechanisms. This responsiveness allows tighter virtual machine memory footprints while preserving target application throughput, which is ideal for both embedded and utility computing domains. In short, we argue that formal, systematic approaches to memory management should be replacing ad-hoc heuristics as the discipline matures. Control-theoretic heap sizing is one such systematic approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Brock:2013:PPA, author = "Jacob Brock and Xiaoming Gu and Bin Bao and Chen Ding", title = "{Pacman}: program-assisted cache management", journal = j-SIGPLAN, volume = "48", number = "11", pages = "39--50", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2466482", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "As caches become larger and shared by an increasing number of cores, cache management is becoming more important. This paper explores collaborative caching, which uses software hints to influence hardware caching. Recent studies have shown that such collaboration between software and hardware can theoretically achieve optimal cache replacement on LRU-like cache. This paper presents Pacman, a practical solution for collaborative caching in loop-based code. Pacman uses profiling to analyze patterns in an optimal caching policy in order to determine which data to cache and at what time. It then splits each loop into different parts at compile time. At run time, the loop boundary is adjusted to selectively store data that would be stored in an optimal policy. In this way, Pacman emulates the optimal policy wherever it can. Pacman requires a single bit at the load and store instructions. Some of the current hardware has partial support. This paper presents results using both simulated and real systems, and compares simulated results to related caching policies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wang:2013:GSE, author = "Yan Wang and Iulian Neamtiu and Rajiv Gupta", title = "Generating sound and effective memory debuggers", journal = j-SIGPLAN, volume = "48", number = "11", pages = "51--62", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2464159", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "We present a new approach for constructing debuggers based on declarative specification of bug conditions and root causes, and automatic generation of debugger code. We illustrate our approach on several classes of bugs, memory or otherwise. 
For each bug class, bug conditions and their root cause are specified declaratively, in First-order logic, using 1 to 4 predicates. We employ a low-level operational semantics and abstract traces to permit concise bug specification and prove soundness. To facilitate locating bugs, we introduce a new concept of value propagation chains that reduce programmer burden by narrowing the fault to a handful of executed instructions (1 to 16 in our experiments). We employ automatic translation to generate the debugger implementation, which runs on top of the Pin infrastructure. Experiments with using our system on 7 versions of 4 real-world programs show that our approach is expressive, effective at finding bugs and their causes, and efficient. We believe that, using our approach, other kinds of declaratively-specified, provably-correct, auto-generated debuggers can be constructed with little effort.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kalibera:2013:RBR, author = "Tomas Kalibera and Richard Jones", title = "Rigorous benchmarking in reasonable time", journal = j-SIGPLAN, volume = "48", number = "11", pages = "63--74", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2464160", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "ISMM '13 conference proceedings.", abstract = "Experimental evaluation is key to systems research. Because modern systems are complex and non-deterministic, good experimental methodology demands that researchers account for uncertainty. To obtain valid results, they are expected to run many iterations of benchmarks, invoke virtual machines (VMs) several times, or even rebuild VM or benchmark binaries more than once. All this repetition costs time to complete experiments. Currently, many evaluations give up on sufficient repetition or rigorous statistical methods, or even run benchmarks only in training sizes. The results reported often lack proper variation estimates and, when a small difference between two systems is reported, some are simply unreliable. In contrast, we provide a statistically rigorous methodology for repetition and summarising results that makes efficient use of experimentation time. Time efficiency comes from two key observations. First, a given benchmark on a given platform is typically prone to much less non-determinism than the common worst-case of published corner-case studies. Second, repetition is most needed where most uncertainty arises (whether between builds, between executions or between iterations). We capture experimentation cost with a novel mathematical model, which we use to identify the number of repetitions at each level of an experiment necessary and sufficient to obtain a given level of precision. We present our methodology as a cookbook that guides researchers on the number of repetitions they should run to obtain reliable results. We also show how to present results with an effect size confidence interval. 
As an example, we show how to use our methodology to conduct throughput experiments with the DaCapo and SPEC CPU benchmarks on three recent platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Aigner:2013:ATU, author = "Martin Aigner and Christoph M. Kirsch", title = "{ACDC}: towards a universal mutator for benchmarking heap management systems", journal = j-SIGPLAN, volume = "48", number = "11", pages = "75--84", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2464161", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "We present ACDC, an open-source benchmark that may be configured to emulate explicit single- and multi-threaded memory allocation, sharing, access, and deallocation behavior to expose virtually any relevant allocator performance differences. ACDC mimics periodic memory allocation and deallocation (AC) as well as persistent memory (DC). Memory may be allocated thread-locally and shared among multiple threads to study multicore scalability and even false sharing. Memory may be deallocated by threads other than the allocating threads to study blowup memory fragmentation. Memory may be accessed and deallocated sequentially in allocation order or in tree-like traversals to expose allocator deficiencies in exploiting spatial locality. We demonstrate ACDC's capabilities with seven state-of-the-art allocators for C/C++ in an empirical study which also reveals interesting performance differences between the allocators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2013:PSC, author = "Lian Li and Cristina Cifuentes and Nathan Keynes", title = "Precise and scalable context-sensitive pointer analysis via value flow graph", journal = j-SIGPLAN, volume = "48", number = "11", pages = "85--96", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2466483", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "In this paper, we propose a novel method for context-sensitive pointer analysis using the value flow graph (VFG) formulation. We achieve context-sensitivity by simultaneously applying function cloning and computing context-free language reachability (CFL-reachability) in a novel way. In contrast to existing clone-based and CFL-based approaches, flow-sensitivity is easily integrated in our approach by using a flow-sensitive VFG where each value flow edge is computed in a flow-sensitive manner. We apply context-sensitivity to both local variables and heap objects and propose a new approximation for heap cloning. We prove that our approach can achieve context-sensitivity without loss of precision, i.e., it is as precise as inlining all function calls. We develop an efficient algorithm and implement a context-, flow-, and field-sensitive pointer analysis with heap cloning support in LLVM. We evaluate the efficiency and precision of our implementation using standard SPEC CPU2006 benchmarks. 
Our experimental results show that the analysis is much faster than existing approaches, it scales well to large real-world applications, and it enables more effective compiler optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ravitch:2013:AMO, author = "Tristan Ravitch and Ben Liblit", title = "Analyzing memory ownership patterns in {C} libraries", journal = j-SIGPLAN, volume = "48", number = "11", pages = "97--108", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2464162", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "Programs written in multiple languages are known as polyglot programs. In part due to the proliferation of new and productive high-level programming languages, these programs are becoming more common in environments that must interoperate with existing systems. Polyglot programs must manage resource lifetimes across language boundaries. Resource lifetime management bugs can lead to leaks and crashes, which are more difficult to debug in polyglot programs than monoglot programs. We present analyses to automatically infer the ownership semantics of C libraries. The results of these analyses can be used to generate bindings to C libraries that intelligently manage resources, to check the correctness of polyglot programs, and to document the interfaces of C libraries. While these analyses are unsound and incomplete, we demonstrate that they significantly reduce the manual annotation burden for a suite of fifteen open source libraries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ricci:2013:ETP, author = "Nathan P. Ricci and Samuel Z. Guyer and J. Eliot B. Moss", title = "{Elephant Tracks}: portable production of complete and precise {GC} traces", journal = j-SIGPLAN, volume = "48", number = "11", pages = "109--118", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2466484", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "We present Elephant Tracks (ET), a dynamic program analysis tool for Java that produces detailed traces of garbage collection-related events, including object allocations, object deaths, and pointer updates. Like prior work, our tracing tool is based on the Merlin algorithm [6,7], but offers several substantial new capabilities. First, it is much more precise than previous tools: it traces method entries and exits and measures time in terms of them, allowing it to place events precisely in the context of the program structure. Second, it is implemented using a combination of JVM Tool Interface (JVMTI)[13] callbacks and bytecode rewriting, and works with any standard JVM. Finally, it produces complete traces, including weak references, events from the Java Native Interface and sun.misc.Unsafe, and VM start up objects. 
In this paper we also explore the general design space of tracing tools, and carefully define the execution model that the traces represent.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bu:2013:BAD, author = "Yingyi Bu and Vinayak Borkar and Guoqing Xu and Michael J. Carey", title = "A bloat-aware design for big data applications", journal = j-SIGPLAN, volume = "48", number = "11", pages = "119--130", month = nov, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2555670.2466485", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 08:04:34 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "ISMM '13 conference proceedings.", abstract = "Over the past decade, the increasing demands on data-driven business intelligence have led to the proliferation of large-scale, data-intensive applications that often have huge amounts of data (often at terabyte or petabyte scale) to process. An object-oriented programming language such as Java is often the developer's choice for implementing such applications, primarily due to its quick development cycle and rich community resource. While the use of such languages makes programming easier, significant performance problems can often be seen --- the combination of the inefficiencies inherent in a managed run-time system and the impact of the huge amount of data to be processed in the limited memory space often leads to memory bloat and performance degradation at a surprisingly early stage. This paper proposes a bloat-aware design paradigm towards the development of efficient and scalable Big Data applications in object-oriented GC enabled languages. To motivate this work, we first perform a study on the impact of several typical memory bloat patterns. These patterns are summarized from the user complaints on the mailing lists of two widely-used open-source Big Data applications. Next, we discuss our design paradigm to eliminate bloat. Using examples and real-world experience, we demonstrate that programming under this paradigm does not incur significant programming burden. We have implemented a few common data processing tasks both using this design and using the conventional object-oriented design. Our experimental results show that this new design paradigm is extremely effective in improving performance --- even for the moderate-size data sets processed, we have observed 2.5x+ performance gains, and the improvement grows substantially with the size of the data set.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ankner:2013:EAH, author = "Johan Ankner and Josef David Svenningsson", title = "An {EDSL} approach to high performance {Haskell} programming", journal = j-SIGPLAN, volume = "48", number = "12", pages = "1--12", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503789", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "This paper argues for a new methodology for writing high performance Haskell programs by using Embedded Domain Specific Languages. 
We exemplify the methodology by describing a complete library, meta-repa, which is a reimplementation of parts of the repa library. The paper describes the implementation of meta-repa and contrasts it with the standard approach to writing high performance libraries. We conclude that even though the embedded language approach has an initial cost of defining the language and some syntactic overhead it gives a more tailored programming model, stronger performance guarantees, better control over optimizations, simpler implementation of fusion and inlining and allows for moving type level programming down to value level programming in some cases. We also provide benchmarks showing that meta-repa is as fast, or faster, than repa. Furthermore, meta-repa also includes push arrays and we demonstrate their usefulness for writing certain high performance kernels such as FFT.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bernardy:2013:NFP, author = "Jean-Philippe Bernardy and Nicolas Pouillard", title = "Names for free: polymorphic views of names and binders", journal = j-SIGPLAN, volume = "48", number = "12", pages = "13--24", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503780", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "We propose a novel technique to represent names and binders in Haskell. The dynamic (run-time) representation is based on de Bruijn indices, but it features an interface to write and manipulate variables conveniently, using Haskell-level lambdas and variables. The key idea is to use rich types: a subterm with an additional free variable is viewed either as $ \forall \nu . \nu \to {\rm Term}(a + \nu) $ or $ \exists \nu . \nu \times {\rm Term}(a + v) $ depending on whether it is constructed or analysed. We demonstrate on a number of examples how this approach permits to express term construction and manipulation in a natural way, while retaining the good properties of representations based on de Bruijn indices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bird:2013:UIT, author = "Richard Bird and Jeremy Gibbons and Stefan Mehner and Janis Voigtl{\"a}nder and Tom Schrijvers", title = "Understanding idiomatic traversals backwards and forwards", journal = j-SIGPLAN, volume = "48", number = "12", pages = "25--36", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503781", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "We present new ways of reasoning about a particular class of effectful Haskell programs, namely those expressed as idiomatic traversals. Starting out with a specific problem about labelling and unlabelling binary trees, we extract a general inversion law, applicable to any monad, relating a traversal over the elements of an arbitrary traversable type to a traversal that goes in the opposite direction. 
This law can be invoked to show that, in a suitable sense, unlabelling is the inverse of labelling. The inversion law, as well as a number of other properties of idiomatic traversals, is a corollary of a more general theorem characterising traversable functors as finitary containers: an arbitrary traversable object can be decomposed uniquely into shape and contents, and traversal be understood in terms of those. Proof of the theorem involves the properties of traversal in a special idiom related to the free applicative functor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Blazevic:2013:ASM, author = "Mario Bla{\v{z}}evi{\'c}", title = "Adding structure to monoids: thus hopefully ending {Haskell}'s string type confusion", journal = j-SIGPLAN, volume = "48", number = "12", pages = "37--46", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503785", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "This paper presents the rationale and design of monoid-subclasses. This Haskell library consists of a collection of type classes that generalize the interface of several common data types, most importantly those used to represent strings. We demonstrate that the mathematical theory behind monoid-subclasses can bring substantial practical benefits to the Haskell library ecosystem by generalizing attoparsec, one of the most popular Haskell parsing libraries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Claessen:2013:SPN, author = "Koen Claessen and Michal H. Palka", title = "Splittable pseudorandom number generators using cryptographic hashing", journal = j-SIGPLAN, volume = "48", number = "12", pages = "47--58", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503784", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "We propose a new splittable pseudorandom number generator (PRNG) based on a cryptographic hash function. Splittable PRNGs, in contrast to linear PRNGs, allow the creation of two (seemingly) independent generators from a given random number generator. Splittable PRNGs are very useful for structuring purely functional programs, as they avoid the need for threading around state. We show that the currently known and used splittable PRNGs are either not efficient enough, have inherent flaws, or lack formal arguments about their randomness. In contrast, our proposed generator can be implemented efficiently, and comes with a formal statements and proofs that quantify how 'random' the results are that are generated. 
The provided proofs give strong randomness guarantees under assumptions commonly made in cryptography.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kiselyov:2013:EEA, author = "Oleg Kiselyov and Amr Sabry and Cameron Swords", title = "Extensible effects: an alternative to monad transformers", journal = j-SIGPLAN, volume = "48", number = "12", pages = "59--70", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503791", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "We design and implement a library that solves the long-standing problem of combining effects without imposing restrictions on their interactions (such as static ordering). Effects arise from interactions between a client and an effect handler (interpreter); interactions may vary throughout the program and dynamically adapt to execution conditions. Existing code that relies on monad transformers may be used with our library with minor changes, gaining efficiency over long monad stacks. In addition, our library has greater expressiveness, allowing for practical idioms that are inefficient, cumbersome, or outright impossible with monad transformers. Our alternative to a monad transformer stack is a single monad, for the coroutine-like communication of a client with its handler. Its type reflects possible requests, i.e., possible effects of a computation. To support arbitrary effects and their combinations, requests are values of an extensible union type, which allows adding and, notably, subtracting summands. Extending and, upon handling, shrinking of the union of possible requests is reflected in its type, yielding a type-and-effect system for Haskell. The library is lightweight, generalizing the extensible exception handling to other effects and accurately tracking them in types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Leslie-Hurd:2013:MVS, author = "Joe Leslie-Hurd", title = "Maintaining verified software", journal = j-SIGPLAN, volume = "48", number = "12", pages = "71--80", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503787", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "Maintaining software in the face of evolving dependencies is a challenging problem, and in addition to good release practices there is a need for automatic dependency analysis tools to avoid errors creeping in. Verified software reveals more semantic information in the form of mechanized proofs of functional specifications, and this can be used for dependency analysis. In this paper we present a scheme for automatic dependency analysis of verified software, which for each program checks that the collection of installed libraries is sufficient to guarantee its functional correctness. We illustrate the scheme with a case study of Haskell packages verified in higher order logic. 
The dependency analysis reduces the burden of maintaining verified Haskell packages by automatically computing version ranges for the packages they depend on, such that any combination provides the functionality required for correct operation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lindley:2013:HPP, author = "Sam Lindley and Conor McBride", title = "{Hasochism}: the pleasure and pain of dependently typed {Haskell} programming", journal = j-SIGPLAN, volume = "48", number = "12", pages = "81--92", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503786", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "Haskell's type system has outgrown its Hindley-Milner roots to the extent that it now stretches to the basics of dependently typed programming. In this paper, we collate and classify techniques for programming with dependent types in Haskell, and contribute some new ones. In particular, through extended examples --- merge-sort and rectangular tilings --- we show how to exploit Haskell's constraint solver as a theorem prover, delivering code which, as Agda programmers, we envy. We explore the compromises involved in simulating variations on the theme of the dependent function space in an attempt to help programmers put dependent types to work, and to inform the evolving language design both of Haskell and of dependently typed languages more broadly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lippmeier:2013:DFF, author = "Ben Lippmeier and Manuel M. T. Chakravarty and Gabriele Keller and Amos Robinson", title = "Data flow fusion with series expressions in {Haskell}", journal = j-SIGPLAN, volume = "48", number = "12", pages = "93--104", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503782", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "Existing approaches to array fusion can deal with straight-line producer consumer pipelines, but cannot fuse branching data flows where a generated array is consumed by several different consumers. Branching data flows are common and natural to write, but a lack of fusion leads to the creation of an intermediate array at every branch point. We present a new array fusion system that handles branches, based on Waters's series expression framework, but extended to work in a functional setting. Our system also solves a related problem in stream fusion, namely the introduction of duplicate loop counters. We demonstrate speedup over existing fusion systems for several key examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liu:2013:ILH, author = "Hai Liu and Neal Glew and Leaf Petersen and Todd A. 
Anderson", title = "The {Intel} labs {Haskell} research compiler", journal = j-SIGPLAN, volume = "48", number = "12", pages = "105--116", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503779", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "The Glasgow Haskell Compiler (GHC) is a well supported optimizing compiler for the Haskell programming language, along with its own extensions to the language and libraries. Haskell's lazy semantics imposes a runtime model which is in general difficult to implement efficiently. GHC achieves good performance across a wide variety of programs via aggressive optimization taking advantage of the lack of side effects, and by targeting a carefully tuned virtual machine. The Intel Labs Haskell Research Compiler uses GHC as a frontend, but provides a new whole-program optimizing backend by compiling the GHC intermediate representation to a relatively generic functional language compilation platform. We found that GHC's external Core language was relatively easy to use, but reusing GHC's libraries and achieving full compatibility were harder. For certain classes of programs, our platform provides substantial performance benefits over GHC alone, performing $ 2 \times $ faster than GHC with the LLVM backend on selected modern performance-oriented benchmarks; for other classes of programs, the benefits of GHC's tuned virtual machine continue to outweigh the benefits of more aggressive whole program optimization. Overall we achieve parity with GHC with the LLVM backend. In this paper, we describe our Haskell compiler stack, its implementation and optimization approach, and present benchmark results comparing it to GHC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{vanderPloeg:2013:MFR, author = "Atze van der Ploeg", title = "Monadic functional reactive programming", journal = j-SIGPLAN, volume = "48", number = "12", pages = "117--128", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503783", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "Functional Reactive Programming (FRP) is a way to program reactive systems in functional style, eliminating many of the problems that arise from imperative techniques. In this paper, we present an alternative FRP formulation that is based on the notion of a reactive computation: a monadic computation which may require the occurrence of external events to continue. A signal computation is a reactive computation that may also emit values. In contrast to signals in other FRP formulations, signal computations can end, leading to a monadic interface for sequencing signal phases. This interface has several advantages: routing is implicit, sequencing signal phases is easier and more intuitive than when using the switching combinators found in other FRP approaches, and dynamic lists require much less boilerplate code. 
In other FRP approaches, either the entire FRP expression is re-evaluated on each external stimulus, or impure techniques are used to prevent redundant re-computations. We show how Monadic FRP can be implemented straightforwardly in a purely functional way while preventing redundant re-computations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Voellmy:2013:MHP, author = "Andreas Richard Voellmy and Junchang Wang and Paul Hudak and Kazuhiko Yamamoto", title = "{Mio}: a high-performance multicore {IO} manager for {GHC}", journal = j-SIGPLAN, volume = "48", number = "12", pages = "129--140", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503790", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "Haskell threads provide a key, lightweight concurrency abstraction to simplify the programming of important network applications such as web servers and software-defined network (SDN) controllers. The flagship Glasgow Haskell Compiler (GHC) introduces a run-time system (RTS) to achieve a high-performance multicore implementation of Haskell threads, by introducing effective components such as a multicore scheduler, a parallel garbage collector, an IO manager, and efficient multicore memory allocation. Evaluations of the GHC RTS, however, show that it does not scale well on multicore processors, leading to poor performance of many network applications that try to use lightweight Haskell threads. In this paper, we show that the GHC IO manager, which is a crucial component of the GHC RTS, is the scaling bottleneck. Through a series of experiments, we identify key data structure, scheduling, and dispatching bottlenecks of the GHC IO manager. We then design a new multicore IO manager named Mio that eliminates all these bottlenecks. Our evaluations show that the new Mio manager improves realistic web server throughput by 6.5x and reduces expected web server response time by 5.7x. We also show that with Mio, McNettle (an SDN controller written in Haskell) can scale effectively to 40+ cores, reach a throughput of over 20 million new requests per second on a single machine, and hence become the fastest of all existing SDN controllers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Wortmann:2013:COH, author = "Peter M. Wortmann and David Duke", title = "Causality of optimized {Haskell}: what is burning our cycles?", journal = j-SIGPLAN, volume = "48", number = "12", pages = "141--152", month = dec, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578854.2503788", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:55 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "Haskell '14 conference proceedings.", abstract = "Profiling real-world Haskell programs is hard, as compiler optimizations make it tricky to establish causality between the source code and program behavior. In this paper we attack the root issue by performing a causality analysis of functional programs under optimization. 
We apply our findings to build a novel profiling infrastructure on top of the Glasgow Haskell Compiler, allowing for performance analysis even of aggressively optimized programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Birkedal:2014:MRA, author = "Lars Birkedal", title = "Modular reasoning about concurrent higher-order imperative programs", journal = j-SIGPLAN, volume = "49", number = "1", pages = "1--1", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2537849", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cousot:2014:GCC, author = "Patrick Cousot and Radhia Cousot", title = "A {Galois} connection calculus for abstract interpretation", journal = j-SIGPLAN, volume = "49", number = "1", pages = "3--4", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2537850", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We introduce a Galois connection calculus for language independent specification of abstract interpretations used in programming language semantics, formal verification, and static analysis. This Galois connection calculus and its type system are typed by abstract interpretation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Castagna:2014:PFS, author = "Giuseppe Castagna and Kim Nguyen and Zhiwu Xu and Hyeonseung Im and Sergue{\"\i} Lenglet and Luca Padovani", title = "Polymorphic functions with set-theoretic types: part 1: syntax, semantics, and evaluation", journal = j-SIGPLAN, volume = "49", number = "1", pages = "5--17", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535840", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "This article is the first part of a two articles series about a calculus with higher-order polymorphic functions, recursive types with arrow and product type constructors and set-theoretic type connectives (union, intersection, and negation). In this first part we define and study the explicitly-typed version of the calculus in which type instantiation is driven by explicit instantiation annotations. In particular, we define an explicitly-typed lambda-calculus with intersection types and an efficient evaluation model for it. In the second part, presented in a companion paper, we define a local type inference system that allows the programmer to omit explicit instantiation annotations, and a type reconstruction system that allows the programmer to omit explicit type annotations.
The work presented in the two articles provides the theoretical foundations and technical machinery needed to design and implement higher-order polymorphic functional languages for semi-structured data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kilpatrick:2014:BRH, author = "Scott Kilpatrick and Derek Dreyer and Simon Peyton Jones and Simon Marlow", title = "{Backpack}: retrofitting {Haskell} with interfaces", journal = j-SIGPLAN, volume = "49", number = "1", pages = "19--31", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535884", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Module systems like that of Haskell permit only a weak form of modularity in which module implementations depend directly on other implementations and must be processed in dependency order. Module systems like that of ML, on the other hand, permit a stronger form of modularity in which explicit interfaces express assumptions about dependencies, and each module can be typechecked and reasoned about independently. In this paper, we present Backpack, a new language for building separately-typecheckable *packages* on top of a weak module system like Haskell's. The design of Backpack is inspired by the MixML module calculus of Rossberg and Dreyer, but differs significantly in detail. Like MixML, Backpack supports explicit interfaces and recursive linking. Unlike MixML, Backpack supports a more flexible applicative semantics of instantiation. Moreover, its design is motivated less by foundational concerns and more by the practical concern of integration into Haskell, which has led us to advocate simplicity --- in both the syntax and semantics of Backpack --- over raw expressive power. The semantics of Backpack packages is defined by elaboration to sets of Haskell modules and binary interface files, thus showing how Backpack maintains interoperability with Haskell while extending it with separate typechecking. Lastly, although Backpack is geared toward integration into Haskell, its design and semantics are largely agnostic with respect to the details of the underlying core language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Casinghino:2014:CPP, author = "Chris Casinghino and Vilhelm Sj{\"o}berg and Stephanie Weirich", title = "Combining proofs and programs in a dependently typed language", journal = j-SIGPLAN, volume = "49", number = "1", pages = "33--45", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535883", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Most dependently-typed programming languages either require that all expressions terminate (e.g. Coq, Agda, and Epigram), or allow infinite loops but are inconsistent when viewed as logics (e.g. Haskell, ATS, $ \Omega $ ). Here, we combine these two approaches into a single dependently-typed core language. 
The language is composed of two fragments that share a common syntax and overlapping semantics: a logic that guarantees total correctness, and a call-by-value programming language that guarantees type safety but not termination. The two fragments may interact: logical expressions may be used as programs; the logic may soundly reason about potentially nonterminating programs; programs can require logical proofs as arguments; and ``mobile'' program values, including proofs computed at runtime, may be used as evidence by the logic. This language allows programmers to work with total and partial functions uniformly, providing a smooth path from functional programming to dependently-typed programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dissegna:2014:TCA, author = "Stefano Dissegna and Francesco Logozzo and Francesco Ranzato", title = "Tracing compilation by abstract interpretation", journal = j-SIGPLAN, volume = "49", number = "1", pages = "47--59", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535866", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Tracing just-in-time compilation is a popular compilation schema for the efficient implementation of dynamic languages, which is commonly used for JavaScript, Python, and PHP. It relies on two key ideas. First, it monitors the execution of the program to detect so-called hot paths, i.e., the most frequently executed paths. Then, it uses some store information available at runtime to optimize hot paths. The result is a residual program where the optimized hot paths are guarded by sufficient conditions ensuring the equivalence of the optimized path and the original program. The residual program is persistently mutated during its execution, e.g., to add new optimized paths or to merge existing paths. Tracing compilation is thus fundamentally different than traditional static compilation. Nevertheless, despite the remarkable practical success of tracing compilation, very little is known about its theoretical foundations. We formalize tracing compilation of programs using abstract interpretation. The monitoring (viz., hot path detection) phase corresponds to an abstraction of the trace semantics that captures the most frequent occurrences of sequences of program points together with an abstraction of their corresponding stores, e.g., a type environment. The optimization (viz., residual program generation) phase corresponds to a transform of the original program that preserves its trace semantics up to a given observation as modeled by some abstraction. We provide a generic framework to express dynamic optimizations and to prove them correct. We instantiate it to prove the correctness of dynamic type specialization. We show that our framework is more general than a recent model of tracing compilation introduced in POPL~2011 by Guo and Palsberg (based on operational bisimulations). 
In our model we can naturally express hot path reentrance and common optimizations like dead-store elimination, which are either excluded or unsound in Guo and Palsberg's framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ramsay:2014:TDA, author = "Steven J. Ramsay and Robin P. Neatherway and C.-H. Luke Ong", title = "A type-directed abstraction refinement approach to higher-order model checking", journal = j-SIGPLAN, volume = "49", number = "1", pages = "61--72", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535873", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "The trivial-automaton model checking problem for higher-order recursion schemes has become a widely studied object in connection with the automatic verification of higher-order programs. The problem is formidably hard: despite considerable progress in recent years, no decision procedures have been demonstrated to scale robustly beyond recursion schemes that comprise more than a few hundred rewrite rules. We present a new, fixed-parameter polynomial time algorithm, based on a novel, type directed form of abstraction refinement in which behaviours of a scheme are distinguished by the abstraction according to the intersection types that they inhabit (the properties that they satisfy). Unlike other intersection type approaches, our algorithm reasons both about acceptance by the property automaton and acceptance by its dual, simultaneously, in order to minimize the amount of work done by converging on the solution to a problem instance from both sides. We have constructed Preface, a prototype implementation of the algorithm, and assembled an extensive body of evidence to demonstrate empirically that the algorithm readily scales to recursion schemes of several thousand rules, well beyond the capabilities of current state-of-the-art higher-order model checkers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Coughlin:2014:FTA, author = "Devin Coughlin and Bor-Yuh Evan Chang", title = "Fissile type analysis: modular checking of almost everywhere invariants", journal = j-SIGPLAN, volume = "49", number = "1", pages = "73--85", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535855", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We present a generic analysis approach to the imperative relationship update problem, in which destructive updates temporarily violate a global invariant of interest. Such invariants can be conveniently and concisely specified with dependent refinement types, which are efficient to check flow-insensitively. Unfortunately, while traditional flow-insensitive type checking is fast, it is inapplicable when the desired invariants can be temporarily broken. 
To overcome this limitation, past works have directly ratcheted up the complexity of the type analysis and associated type invariants, leading to inefficient analysis and verbose specifications. In contrast, we propose a generic lifting of modular refinement type analyses with a symbolic analysis to efficiently and effectively check concise invariants that hold almost everywhere. The result is an efficient, highly modular flow-insensitive type analysis to optimistically check the preservation of global relationship invariants that can fall back to a precise, disjunctive symbolic analysis when the optimistic assumption is violated. This technique permits programmers to temporarily break and then re-establish relationship invariants--a flexibility that is crucial for checking relationships in real-world, imperative languages. A significant challenge is selectively violating the global type consistency invariant over heap locations, which we achieve via almost type-consistent heaps. To evaluate our approach, we have encoded the problem of verifying the safety of reflective method calls in dynamic languages as a refinement type checking problem. Our analysis is capable of validating reflective call safety at interactive speeds on commonly-used Objective-C libraries and applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bodin:2014:TMJ, author = "Martin Bodin and Arthur Chargueraud and Daniele Filaretti and Philippa Gardner and Sergio Maffeis and Daiva Naudziuniene and Alan Schmitt and Gareth Smith", title = "A trusted mechanised {JavaScript} specification", journal = j-SIGPLAN, volume = "49", number = "1", pages = "87--100", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535876", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "JavaScript is the most widely used web language for client-side applications. Whilst the development of JavaScript was initially just led by implementation, there is now increasing momentum behind the ECMA standardisation process. The time is ripe for a formal, mechanised specification of JavaScript, to clarify ambiguities in the ECMA standards, to serve as a trusted reference for high-level language compilation and JavaScript implementations, and to provide a platform for high-assurance proofs of language properties. We present JSCert, a formalisation of the current ECMA standard in the Coq proof assistant, and JSRef, a reference interpreter for JavaScript extracted from Coq to OCaml. We give a Coq proof that JSRef is correct with respect to JSCert and assess JSRef using test262, the ECMA conformance test suite. Our methodology ensures that JSCert is a comparatively accurate formulation of the English standard, which will only improve as time goes on. 
We have demonstrated that modern techniques of mechanised specification can handle the complexity of JavaScript.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Krebbers:2014:OAS, author = "Robbert Krebbers", title = "An operational and axiomatic semantics for non-determinism and sequence points in {C}", journal = j-SIGPLAN, volume = "49", number = "1", pages = "101--112", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535878", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "The C11 standard of the C programming language does not specify the execution order of expressions. Besides, to make more effective optimizations possible (e.g., delaying of side-effects and interleaving), it gives compilers in certain cases the freedom to use even more behaviors than just those of all execution orders. Widely used C compilers actually exploit this freedom given by the C standard for optimizations, so it should be taken seriously in formal verification. This paper presents an operational and axiomatic semantics (based on separation logic) for non-determinism and sequence points in C. We prove soundness of our axiomatic semantics with respect to our operational semantics. This proof has been fully formalized using the Coq proof assistant.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Anderson:2014:NSF, author = "Carolyn Jane Anderson and Nate Foster and Arjun Guha and Jean-Baptiste Jeannin and Dexter Kozen and Cole Schlesinger and David Walker", title = "{NetKAT}: semantic foundations for networks", journal = j-SIGPLAN, volume = "49", number = "1", pages = "113--126", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535862", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Recent years have seen growing interest in high-level languages for programming networks. But the design of these languages has been largely ad hoc, driven more by the needs of applications and the capabilities of network hardware than by foundational principles. The lack of a semantic foundation has left language designers with little guidance in determining how to incorporate new features, and programmers without a means to reason precisely about their code. This paper presents NetKAT, a new network programming language that is based on a solid mathematical foundation and comes equipped with a sound and complete equational theory. We describe the design of NetKAT, including primitives for filtering, modifying, and transmitting packets; union and sequential composition operators; and a Kleene star operator that iterates programs. We show that NetKAT is an instance of a canonical and well-studied mathematical structure called a Kleene algebra with tests (KAT) and prove that its equational theory is sound and complete with respect to its denotational semantics. 
Finally, we present practical applications of the equational theory including syntactic techniques for checking reachability, proving non-interference properties that ensure isolation between programs, and establishing the correctness of compilation algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sharma:2014:BVT, author = "Rahul Sharma and Aditya V. Nori and Alex Aiken", title = "Bias-variance tradeoffs in program analysis", journal = j-SIGPLAN, volume = "49", number = "1", pages = "127--137", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535853", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "It is often the case that increasing the precision of a program analysis leads to worse results. It is our thesis that this phenomenon is the result of fundamental limits on the ability to use precise abstract domains as the basis for inferring strong invariants of programs. We show that bias-variance tradeoffs, an idea from learning theory, can be used to explain why more precise abstractions do not necessarily lead to better results and also provides practical techniques for coping with such limitations. Learning theory captures precision using a combinatorial quantity called the VC dimension. We compute the VC dimension for different abstractions and report on its usefulness as a precision metric for program analyses. We evaluate cross validation, a technique for addressing bias-variance tradeoffs, on an industrial strength program verification tool called YOGI. The tool produced using cross validation has significantly better running time, finds new defects, and has fewer time-outs than the current production version. Finally, we make some recommendations for tackling bias-variance tradeoffs in program analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{DSilva:2014:AS, author = "Vijay D'Silva and Leopold Haller and Daniel Kroening", title = "Abstract satisfaction", journal = j-SIGPLAN, volume = "49", number = "1", pages = "139--150", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535868", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "This article introduces an abstract interpretation framework that codifies the operations in SAT and SMT solvers in terms of lattices, transformers and fixed points. We develop the idea that a formula denotes a set of models in a universe of structures. This set of models has characterizations as fixed points of deduction, abduction and quantification transformers. A wide range of satisfiability procedures can be understood as computing and refining approximations of such fixed points. These include procedures in the DPLL family, those for preprocessing and inprocessing in SAT solvers, decision procedures for equality logics, weak arithmetics, and procedures for approximate quantification. 
Our framework provides a unified, mathematical basis for studying and combining program analysis and satisfiability procedures. A practical benefit of our work is a new, logic-agnostic architecture for implementing solvers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Farzan:2014:PC, author = "Azadeh Farzan and Zachary Kincaid and Andreas Podelski", title = "Proofs that count", journal = j-SIGPLAN, volume = "49", number = "1", pages = "151--164", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535885", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Counting arguments are among the most basic proof methods in mathematics. Within the field of formal verification, they are useful for reasoning about programs with infinite control, such as programs with an unbounded number of threads, or (concurrent) programs with recursive procedures. While counting arguments are common in informal, hand-written proofs of such programs, there are no fully automated techniques to construct counting arguments. The key questions involved in automating counting arguments are: how to decide what should be counted?, and how to decide when a counting argument is valid? In this paper, we present a technique for automatically constructing and checking counting arguments, which includes novel solutions to these questions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{deAmorim:2014:VIF, author = "Arthur Azevedo de Amorim and Nathan Collins and Andr{\'e} DeHon and Delphine Demange and Catalin Hritcu and David Pichardie and Benjamin C. Pierce and Randy Pollack and Andrew Tolmach", title = "A verified information-flow architecture", journal = j-SIGPLAN, volume = "49", number = "1", pages = "165--178", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535839", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "SAFE is a clean-slate design for a highly secure computer system, with pervasive mechanisms for tracking and limiting information flows. At the lowest level, the SAFE hardware supports fine-grained programmable tags, with efficient and flexible propagation and combination of tags as instructions are executed. The operating system virtualizes these generic facilities to present an information-flow abstract machine that allows user programs to label sensitive data with rich confidentiality policies. We present a formal, machine-checked model of the key hardware and software mechanisms used to control information flow in SAFE and an end-to-end proof of noninterference for this model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kumar:2014:CVI, author = "Ramana Kumar and Magnus O. 
Myreen and Michael Norrish and Scott Owens", title = "{CakeML}: a verified implementation of {ML}", journal = j-SIGPLAN, volume = "49", number = "1", pages = "179--191", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535841", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We have developed and mechanically verified an ML system called CakeML, which supports a substantial subset of Standard ML. CakeML is implemented as an interactive read-eval-print loop (REPL) in x86-64 machine code. Our correctness theorem ensures that this REPL implementation prints only those results permitted by the semantics of CakeML. Our verification effort touches on a breadth of topics including lexing, parsing, type checking, incremental and dynamic compilation, garbage collection, arbitrary-precision arithmetic, and compiler bootstrapping. Our contributions are twofold. The first is simply in building a system that is end-to-end verified, demonstrating that each piece of such a verification effort can in practice be composed with the others, and ensuring that none of the pieces rely on any over-simplifying assumptions. The second is developing novel approaches to some of the more challenging aspects of the verification. In particular, our formally verified compiler can bootstrap itself: we apply the verified compiler to itself to produce a verified machine-code implementation of the compiler. Additionally, our compiler proof handles diverging input programs with a lightweight approach based on logical timeout exceptions. The entire development was carried out in the HOL4 theorem prover.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Barthe:2014:PRV, author = "Gilles Barthe and C{\'e}dric Fournet and Benjamin Gr{\'e}goire and Pierre-Yves Strub and Nikhil Swamy and Santiago Zanella-B{\'e}guelin", title = "Probabilistic relational verification for cryptographic implementations", journal = j-SIGPLAN, volume = "49", number = "1", pages = "193--205", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535847", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Relational program logics have been used for mechanizing formal proofs of various cryptographic constructions. With an eye towards scaling these successes towards end-to-end security proofs for implementations of distributed systems, we present RF*, a relational extension of F*, a general-purpose higher-order stateful programming language with a verification system based on refinement types. The distinguishing feature of F* is a relational Hoare logic for a higher-order, stateful, probabilistic language. Through careful language design, we adapt the F* typechecker to generate both classic and relational verification conditions, and to automatically discharge their proofs using an SMT solver. Thus, we are able to benefit from the existing features of F*, including its abstraction facilities for modular reasoning about program fragments. 
We evaluate RF* experimentally by programming a series of cryptographic constructions and protocols, and by verifying their security properties, ranging from information flow to unlinkability, integrity, and privacy. Moreover, we validate the design of RF* by formalizing in Coq a core probabilistic \lambda calculus and a relational refinement type system and proving the soundness of the latter against a denotational semantics of the probabilistic \lambda calculus.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chaudhuri:2014:BBQ, author = "Swarat Chaudhuri and Martin Clochard and Armando Solar-Lezama", title = "Bridging boolean and quantitative synthesis using smoothed proof search", journal = j-SIGPLAN, volume = "49", number = "1", pages = "207--220", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535859", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We present a new technique for parameter synthesis under boolean and quantitative objectives. The input to the technique is a ``sketch'' --- a program with missing numerical parameters --- and a probabilistic assumption about the program's inputs. The goal is to automatically synthesize values for the parameters such that the resulting program satisfies: (1) a {boolean specification}, which states that the program must meet certain assertions, and (2) a {quantitative specification}, which assigns a real valued rating to every program and which the synthesizer is expected to optimize. Our method --- called smoothed proof search --- reduces this task to a sequence of unconstrained smooth optimization problems that are then solved numerically. By iteratively solving these problems, we obtain parameter values that get closer and closer to meeting the boolean specification; at the limit, we obtain values that provably meet the specification. The approximations are computed using a new notion of smoothing for program abstractions, where an abstract transformer is approximated by a function that is continuous according to a metric over abstract states. We present a prototype implementation of our synthesis procedure, and experimental results on two benchmarks from the embedded control domain.
The experiments demonstrate the benefits of smoothed proof search over an approach that does not meet the boolean and quantitative synthesis goals simultaneously.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Beyene:2014:CBA, author = "Tewodros Beyene and Swarat Chaudhuri and Corneliu Popeea and Andrey Rybalchenko", title = "A constraint-based approach to solving games on infinite graphs", journal = j-SIGPLAN, volume = "49", number = "1", pages = "221--233", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535860", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We present a constraint-based approach to computing winning strategies in two-player graph games over the state space of infinite-state programs. Such games have numerous applications in program verification and synthesis, including the synthesis of infinite-state reactive programs and branching-time verification of infinite-state programs. Our method handles games with winning conditions given by safety, reachability, and general Linear Temporal Logic (LTL) properties. For each property class, we give a deductive proof rule that --- provided a symbolic representation of the game players --- describes a winning strategy for a particular player. Our rules are sound and relatively complete. We show that these rules can be automated by using an off-the-shelf Horn constraint solver that supports existential quantification in clause heads. The practical promise of the rules is demonstrated through several case studies, including a challenging ``Cinderella-Stepmother game'' that allows infinite alternation of discrete and continuous choices by two players, as well as examples derived from prior work on program repair and synthesis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Darulova:2014:SCR, author = "Eva Darulova and Viktor Kuncak", title = "Sound compilation of reals", journal = j-SIGPLAN, volume = "49", number = "1", pages = "235--248", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535874", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Writing accurate numerical software is hard because of many sources of unavoidable uncertainties, including finite numerical precision of implementations. We present a programming model where the user writes a program in a real-valued implementation and specification language that explicitly includes different types of uncertainties. We then present a compilation algorithm that generates a finite-precision implementation that is guaranteed to meet the desired precision with respect to real numbers. Our compilation performs a number of verification steps for different candidate precisions. It generates verification conditions that treat all sources of uncertainties in a unified way and encode reasoning about finite-precision roundoff errors into reasoning about real numbers. 
Such verification conditions can be used as a standardized format for verifying the precision and the correctness of numerical programs. Due to their non-linear nature, precise reasoning about these verification conditions remains difficult and cannot be handled using state-of-the art SMT solvers alone. We therefore propose a new procedure that combines exact SMT solving over reals with approximate and sound affine and interval arithmetic. We show that this approach overcomes scalability limitations of SMT solvers while providing improved precision over affine and interval arithmetic. Our implementation gives promising results on several numerical models, including dynamical systems, transcendental functions, and controller implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Huet:2014:YRD, author = "G{\'e}rard Huet and Hugo Herbelin", title = "30 years of research and development around {Coq}", journal = j-SIGPLAN, volume = "49", number = "1", pages = "249--249", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2537848", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Brookes:2014:ER, author = "Stephen Brookes and Peter W. O'Hearn and Uday Reddy", title = "The essence of {Reynolds}", journal = j-SIGPLAN, volume = "49", number = "1", pages = "251--255", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2537851", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "John Reynolds (1935-2013) was a pioneer of programming languages research. In this paper we pay tribute to the man, his ideas, and his influence.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kuper:2014:FAW, author = "Lindsey Kuper and Aaron Turon and Neelakantan R. Krishnaswami and Ryan R. Newton", title = "Freeze after writing: quasi-deterministic parallel programming with {LVars}", journal = j-SIGPLAN, volume = "49", number = "1", pages = "257--270", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535842", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Deterministic-by-construction parallel programming models offer the advantages of parallel speedup while avoiding the nondeterministic, hard-to-reproduce bugs that plague fully concurrent code. A principled approach to deterministic-by-construction parallel programming with shared state is offered by LVars: shared memory locations whose semantics are defined in terms of an application-specific lattice. 
Writes to an LVar take the least upper bound of the old and new values with respect to the lattice, while reads from an LVar can observe only that its contents have crossed a specified threshold in the lattice. Although it guarantees determinism, this interface is quite limited. We extend LVars in two ways. First, we add the ability to ``freeze'' and then read the contents of an LVar directly. Second, we add the ability to attach event handlers to an LVar, triggering a callback when the LVar's value changes. Together, handlers and freezing enable an expressive and useful style of parallel programming. We prove that in a language where communication takes place through these extended LVars, programs are at worst quasi-deterministic: on every run, they either produce the same answer or raise an error. We demonstrate the viability of our approach by implementing a library for Haskell supporting a variety of LVar-based data structures, together with a case study that illustrates the programming model and yields promising parallel speedup.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Burckhardt:2014:RDT, author = "Sebastian Burckhardt and Alexey Gotsman and Hongseok Yang and Marek Zawirski", title = "Replicated data types: specification, verification, optimality", journal = j-SIGPLAN, volume = "49", number = "1", pages = "271--284", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535848", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Geographically distributed systems often rely on replicated eventually consistent data stores to achieve availability and performance. To resolve conflicting updates at different replicas, researchers and practitioners have proposed specialized consistency protocols, called replicated data types, that implement objects such as registers, counters, sets or lists. Reasoning about replicated data types has however not been on par with comparable work on abstract data types and concurrent data types, lacking specifications, correctness proofs, and optimality results. To fill in this gap, we propose a framework for specifying replicated data types using relations over events and verifying their implementations using replication-aware simulations. We apply it to 7 existing implementations of 4 data types with nontrivial conflict-resolution strategies and optimizations (last-writer-wins register, counter, multi-value register and observed-remove set). We also present a novel technique for obtaining lower bounds on the worst-case space overhead of data type implementations and use it to prove optimality of 4 implementations. Finally, we show how to specify consistency of replicated stores with multiple objects axiomatically, in analogy to prior work on weak memory models. 
Overall, our work provides foundational reasoning tools to support research on replicated eventually consistent stores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bouajjani:2014:VEC, author = "Ahmed Bouajjani and Constantin Enea and Jad Hamza", title = "Verifying eventual consistency of optimistic replication systems", journal = j-SIGPLAN, volume = "49", number = "1", pages = "285--296", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535877", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We address the verification problem of eventual consistency of optimistic replication systems. Such systems are typically used to implement distributed data structures over large scale networks. We introduce a formal definition of eventual consistency that applies to a wide class of existing implementations, including the ones using speculative executions. Then, we reduce the problem of checking eventual consistency to reachability and model checking problems. This reduction enables the use of existing verification tools for message-passing programs in the context of verifying optimistic replication systems. Furthermore, we derive from these reductions decision procedures for checking eventual consistency of systems implemented as finite-state programs communicating through unbounded unordered channels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{DalLago:2014:CEH, author = "Ugo {Dal Lago} and Davide Sangiorgi and Michele Alberti", title = "On coinductive equivalences for higher-order probabilistic functional programs", journal = j-SIGPLAN, volume = "49", number = "1", pages = "297--308", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535872", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We study bisimulation and context equivalence in a probabilistic lambda-calculus. The contributions of this paper are threefold. Firstly we show a technique for proving congruence of probabilistic applicative bisimilarity. While the technique follows Howe's method, some of the technicalities are quite different, relying on non-trivial ``disentangling'' properties for sets of real numbers. Secondly we show that, while bisimilarity is in general strictly finer than context equivalence, coincidence between the two relations is attained on pure lambda-terms. The resulting equality is that induced by Levy--Longo trees, generally accepted as the finest extensional equivalence on pure lambda-terms under a lazy regime. Finally, we derive a coinductive characterisation of context equivalence on the whole probabilistic language, via an extension in which terms akin to distributions may appear in redex position. 
Another motivation for the extension is that its operational semantics allows us to experiment with a different congruence technique, namely that of logical bisimilarity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ehrhard:2014:PCS, author = "Thomas Ehrhard and Christine Tasson and Michele Pagani", title = "Probabilistic coherence spaces are fully abstract for probabilistic {PCF}", journal = j-SIGPLAN, volume = "49", number = "1", pages = "309--320", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535865", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Probabilistic coherence spaces (PCoh) yield a semantics of higher-order probabilistic computation, interpreting types as convex sets and programs as power series. We prove that the equality of interpretations in Pcoh characterizes the operational indistinguishability of programs in PCF with a random primitive. This is the first result of full abstraction for a semantics of probabilistic PCF. The key ingredient relies on the regularity of power series. Along the way to the theorem, we design a weighted intersection type assignment system giving a logical presentation of PCoh.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gordon:2014:TSD, author = "Andrew D. Gordon and Thore Graepel and Nicolas Rolland and Claudio Russo and Johannes Borgstrom and John Guiver", title = "{Tabular}: a schema-driven probabilistic programming language", journal = j-SIGPLAN, volume = "49", number = "1", pages = "321--334", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535850", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We propose a new kind of probabilistic programming language for machine learning. We write programs simply by annotating existing relational schemas with probabilistic model expressions. We describe a detailed design of our language, Tabular, complete with formal semantics and type system. A rich series of examples illustrates the expressiveness of Tabular. We report an implementation, and show evidence of the succinctness of our notation relative to current best practice. Finally, we describe and verify a transformation of Tabular schemas so as to predict missing values in a concrete database. 
The ability to query for missing values provides a uniform interface to a wide variety of tasks, including classification, clustering, recommendation, and ranking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sergey:2014:MHO, author = "Ilya Sergey and Dimitrios Vytiniotis and Simon Peyton Jones", title = "Modular, higher-order cardinality analysis in theory and practice", journal = j-SIGPLAN, volume = "49", number = "1", pages = "335--347", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535861", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Since the mid '80s, compiler writers for functional languages (especially lazy ones) have been writing papers about identifying and exploiting thunks and lambdas that are used only once. However it has proved difficult to achieve both power and simplicity in practice. We describe a new, modular analysis for a higher-order language, which is both simple and effective, and present measurements of its use in a full-scale, state of the art optimising compiler. The analysis finds many single-entry thunks and one-shot lambdas and enables a number of program optimisations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chang:2014:PL, author = "Stephen Chang and Matthias Felleisen", title = "Profiling for laziness", journal = j-SIGPLAN, volume = "49", number = "1", pages = "349--360", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535887", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "While many programmers appreciate the benefits of lazy programming at an abstract level, determining which parts of a concrete program to evaluate lazily poses a significant challenge for most of them. Over the past thirty years, experts have published numerous papers on the problem, but developing this level of expertise requires a significant amount of experience. We present a profiling-based technique that captures and automates this expertise for the insertion of laziness annotations into strict programs. To make this idea precise, we show how to equip a formal semantics with a metric that measures waste in an evaluation. Then we explain how to implement this metric as a dynamic profiling tool that suggests where to insert laziness into a program. 
Finally, we present evidence that our profiler's suggestions either match or improve on an expert's use of laziness in a range of real-world applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cave:2014:FRP, author = "Andrew Cave and Francisco Ferreira and Prakash Panangaden and Brigitte Pientka", title = "Fair reactive programming", journal = j-SIGPLAN, volume = "49", number = "1", pages = "361--372", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535881", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Functional Reactive Programming (FRP) models reactive systems with events and signals, which have previously been observed to correspond to the ``eventually'' and ``always'' modalities of linear temporal logic (LTL). In this paper, we define a constructive variant of LTL with least fixed point and greatest fixed point operators in the spirit of the modal mu-calculus, and give it a proofs-as-programs interpretation as a foundational calculus for reactive programs. Previous work emphasized the propositions-as-types part of the correspondence between LTL and FRP; here we emphasize the proofs-as-programs part by employing structural proof theory. We show that the type system is expressive enough to enforce liveness properties such as the fairness of schedulers and the eventual delivery of results. We illustrate programming in this calculus using (co)iteration operators. We prove type preservation of our operational semantics, which guarantees that our programs are causal. We give also a proof of strong normalization which provides justification that our programs are productive and that they satisfy liveness properties derived from their types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Abdulla:2014:ODP, author = "Parosh Abdulla and Stavros Aronis and Bengt Jonsson and Konstantinos Sagonas", title = "Optimal dynamic partial order reduction", journal = j-SIGPLAN, volume = "49", number = "1", pages = "373--384", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535845", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Stateless model checking is a powerful technique for program verification, which however suffers from an exponential growth in the number of explored executions. A successful technique for reducing this number, while still maintaining complete coverage, is Dynamic Partial Order Reduction (DPOR). We present a new DPOR algorithm, which is the first to be provably optimal in that it always explores the minimal number of executions. It is based on a novel class of sets, called source sets, which replace the role of persistent sets in previous algorithms. First, we show how to modify an existing DPOR algorithm to work with source sets, resulting in an efficient and simple to implement algorithm. 
Second, we extend this algorithm with a novel mechanism, called wakeup trees, that allows us to achieve optimality. We have implemented both algorithms in a stateless model checking tool for Erlang programs. Experiments show that source sets significantly increase the performance and that wakeup trees incur only a small overhead in both time and space.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Itzhaky:2014:MRA, author = "Shachar Itzhaky and Anindya Banerjee and Neil Immerman and Ori Lahav and Aleksandar Nanevski and Mooly Sagiv", title = "Modular reasoning about heap paths via effectively propositional formulas", journal = j-SIGPLAN, volume = "49", number = "1", pages = "385--396", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535854", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "First order logic with transitive closure, and separation logic enable elegant interactive verification of heap-manipulating programs. However, undecidability results and high asymptotic complexity of checking validity preclude complete automatic verification of such programs, even when loop invariants and procedure contracts are specified as formulas in these logics. This paper tackles the problem of procedure-modular verification of reachability properties of heap-manipulating programs using efficient decision procedures that are complete: that is, a SAT solver must generate a counterexample whenever a program does not satisfy its specification. By (a) requiring each procedure modifies a fixed set of heap partitions and creates a bounded amount of heap sharing, and (b) restricting program contracts and loop invariants to use only deterministic paths in the heap, we show that heap reachability updates can be described in a simple manner. The restrictions force program specifications and verification conditions to lie within a fragment of first-order logic with transitive closure that is reducible to effectively propositional logic, and hence facilitate sound, complete and efficient verification. We implemented a tool atop Z3 and report on preliminary experiments that establish the correctness of several programs that manipulate linked data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chong:2014:SCA, author = "Nathan Chong and Alastair F. Donaldson and Jeroen Ketema", title = "A sound and complete abstraction for reasoning about parallel prefix sums", journal = j-SIGPLAN, volume = "49", number = "1", pages = "397--409", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535882", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Prefix sums are key building blocks in the implementation of many concurrent software applications, and recently much work has gone into efficiently implementing prefix sums to run on massively parallel graphics processing units (GPUs).
Because they lie at the heart of many GPU-accelerated applications, the correctness of prefix sum implementations is of prime importance. We introduce a novel abstraction, the interval of summations, that allows scalable reasoning about implementations of prefix sums. We present this abstraction as a monoid, and prove a soundness and completeness result showing that a generic sequential prefix sum implementation is correct for an array of length $n$ if and only if it computes the correct result for a specific test case when instantiated with the interval of summations monoid. This allows correctness to be established by running a single test where the input and result require O(n lg(n)) space. This improves upon an existing result by Sheeran where the input requires O(n lg(n)) space and the result O(n$^2$ \lg(n)) space, and is more feasible for large n than a method by Voigtlaender that uses O(n) space for the input and result but requires running O(n$^2$ ) tests. We then extend our abstraction and results to the context of data-parallel programs, developing an automated verification method for GPU implementations of prefix sums. Our method uses static verification to prove that a generic prefix sum implementation is data race-free, after which functional correctness of the implementation can be determined by running a single test case under the interval of summations abstraction. We present an experimental evaluation using four different prefix sum algorithms, showing that our method is highly automatic, scales to large thread counts, and significantly outperforms Voigtlaender's method when applied to large arrays.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Miller:2014:ADS, author = "Andrew Miller and Michael Hicks and Jonathan Katz and Elaine Shi", title = "Authenticated data structures, generically", journal = j-SIGPLAN, volume = "49", number = "1", pages = "411--423", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535851", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "An authenticated data structure (ADS) is a data structure whose operations can be carried out by an untrusted prover, the results of which a verifier can efficiently check as authentic. This is done by having the prover produce a compact proof that the verifier can check along with each operation's result. ADSs thus support outsourcing data maintenance and processing tasks to untrusted servers without loss of integrity. Past work on ADSs has focused on particular data structures (or limited classes of data structures), one at a time, often with support only for particular operations. This paper presents a generic method, using a simple extension to a ML-like functional programming language we call \lambda o (lambda-auth), with which one can program authenticated operations over any data structure defined by standard type constructors, including recursive types, sums, and products. The programmer writes the data structure largely as usual and it is compiled to code to be run by the prover and verifier. 
Using a formalization of \lambda o we prove that all well-typed \lambda o programs result in code that is secure under the standard cryptographic assumption of collision-resistant hash functions. We have implemented \lambda o as an extension to the OCaml compiler, and have used it to produce authenticated versions of many interesting data structures including binary search trees, red-black+ trees, skip lists, and more. Performance experiments show that our approach is efficient, giving up little compared to the hand-optimized data structures developed previously.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Swamy:2014:GTE, author = "Nikhil Swamy and Cedric Fournet and Aseem Rastogi and Karthikeyan Bhargavan and Juan Chen and Pierre-Yves Strub and Gavin Bierman", title = "Gradual typing embedded securely in {JavaScript}", journal = j-SIGPLAN, volume = "49", number = "1", pages = "425--437", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535889", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "JavaScript's flexible semantics makes writing correct code hard and writing secure code extremely difficult. To address the former problem, various forms of gradual typing have been proposed, such as Closure and TypeScript. However, supporting all common programming idioms is not easy; for example, TypeScript deliberately gives up type soundness for programming convenience. In this paper, we propose a gradual type system and implementation techniques that provide important safety and security guarantees. We present TS\# , a gradual type system and source-to-source compiler for JavaScript. In contrast to prior gradual type systems, TS\# features full runtime reflection over three kinds of types: (1) simple types for higher-order functions, recursive datatypes and dictionary-based extensible records; (2) the type any, for dynamically type-safe TS\# expressions; and (3) the type un, for untrusted, potentially malicious JavaScript contexts in which TS\# is embedded. After type-checking, the compiler instruments the program with various checks to ensure the type safety of TS\# despite its interactions with arbitrary JavaScript contexts, which are free to use eval, stack walks, prototype customizations, and other offensive features. The proof of our main theorem employs a form of type-preserving compilation, wherein we prove all the runtime invariants of the translation of TS\# to JavaScript by showing that translated programs are well-typed in JS\# , a previously proposed dependently typed language for proving functional correctness of JavaScript programs. We describe a prototype compiler, a secure runtime, and sample applications for TS\#. 
Our examples illustrate how web security patterns that developers currently program in JavaScript (with much difficulty and still with dubious results) can instead be programmed naturally in TS\#, retaining a flavor of idiomatic JavaScript, while providing strong safety guarantees by virtue of typing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Long:2014:SIF, author = "Fan Long and Stelios Sidiroglou-Douskos and Deokhwan Kim and Martin Rinard", title = "Sound input filter generation for integer overflow errors", journal = j-SIGPLAN, volume = "49", number = "1", pages = "439--452", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535888", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We present a system, SIFT, for generating input filters that nullify integer overflow errors associated with critical program sites such as memory allocation or block copy sites. SIFT uses a static program analysis to generate filters that discard inputs that may trigger integer overflow errors in the computations of the sizes of allocated memory blocks or the number of copied bytes in block copy operations. Unlike all previous techniques of which we are aware, SIFT is sound --- if an input passes the filter, it will not trigger an integer overflow error at any analyzed site. Our results show that SIFT successfully analyzes (and therefore generates sound input filters for) 56 out of 58 memory allocation and block memory copy sites in analyzed input processing modules from five applications (VLC, Dillo, Swfdec, Swftools, and GIMP). These nullified errors include six known integer overflow vulnerabilities. Our results also show that applying these filters to 62895 real-world inputs produces no false positives. The analysis and filter generation times are all less than a second.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Brotherston:2014:PCS, author = "James Brotherston and Jules Villard", title = "Parametric completeness for separation theories", journal = j-SIGPLAN, volume = "49", number = "1", pages = "453--464", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535844", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "In this paper, we close the logical gap between provability in the logic BBI, which is the propositional basis for separation logic, and validity in an intended class of separation models, as employed in applications of separation logic such as program verification. An intended class of separation models is usually specified by a collection of axioms describing the specific model properties that are expected to hold, which we call a separation theory. Our main contributions are as follows. First, we show that several typical properties of separation theories are not definable in BBI. 
Second, we show that these properties become definable in a suitable hybrid extension of BBI, obtained by adding a theory of naming to BBI in the same way that hybrid logic extends normal modal logic. The binder-free extension captures most of the properties we consider, and the full extension HyBBI($\forall$) with the usual $\forall$ binder of hybrid logic covers all these properties. Third, we present an axiomatic proof system for our hybrid logic whose extension with any set of ``pure'' axioms is sound and complete with respect to the models satisfying those axioms. As a corollary of this general result, we obtain, in a parametric manner, a sound and complete axiomatic proof system for any separation theory from our considered class. To the best of our knowledge, this class includes all separation theories appearing in the published literature.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hou:2014:PSP, author = "Zh{\'e} H{\'o}u and Ranald Clouston and Rajeev Gor{\'e} and Alwen Tiu", title = "Proof search for propositional abstract separation logics via labelled sequents", journal = j-SIGPLAN, volume = "49", number = "1", pages = "465--476", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535864", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Abstract separation logics are a family of extensions of Hoare logic for reasoning about programs that mutate memory. These logics are ``abstract'' because they are independent of any particular concrete memory model. Their assertion languages, called propositional abstract separation logics, extend the logic of (Boolean) Bunched Implications (BBI) in various ways. We develop a modular proof theory for various propositional abstract separation logics using cut-free labelled sequent calculi. We first extend the cut-free labelled sequent calculus for BBI of Hou et al to handle Calcagno et al's original logic of separation algebras by adding sound rules for partial-determinism and cancellativity, while preserving cut-elimination. We prove the completeness of our calculus via a sound intermediate calculus that enables us to construct counter-models from the failure to find a proof. We then capture other propositional abstract separation logics by adding sound rules for indivisible unit and disjointness, while maintaining completeness and cut-elimination.
We present a theorem prover based on our labelled calculus for these logics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lee:2014:PSS, author = "Wonyeol Lee and Sungwoo Park", title = "A proof system for separation logic with magic wand", journal = j-SIGPLAN, volume = "49", number = "1", pages = "477--490", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535871", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Separation logic is an extension of Hoare logic which is acknowledged as an enabling technology for large-scale program verification. It features two new logical connectives, separating conjunction and separating implication, but most of the applications of separation logic have exploited only separating conjunction without considering separating implication. Nevertheless the power of separating implication has been well recognized and there is a growing interest in its use for program verification. This paper develops a proof system for full separation logic which supports not only separating conjunction but also separating implication. The proof system is developed in the style of sequent calculus and satisfies the admissibility of cut. The key challenge in the development is to devise a set of inference rules for manipulating heap structures that ensure the completeness of the proof system with respect to separation logic. We show that our proof of completeness directly translates to a proof search strategy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Atkey:2014:PCL, author = "Robert Atkey", title = "From parametricity to conservation laws, via {Noether}'s theorem", journal = j-SIGPLAN, volume = "49", number = "1", pages = "491--502", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535867", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Invariance is of paramount importance in programming languages and in physics. In programming languages, John Reynolds' theory of relational parametricity demonstrates that parametric polymorphic programs are invariant under change of data representation, a property that yields ``free'' theorems about programs just from their types. In physics, Emmy Noether showed that if the action of a physical system is invariant under change of coordinates, then the physical system has a conserved quantity: a quantity that remains constant for all time. Knowledge of conserved quantities can reveal deep properties of physical systems. For example, the conservation of energy is by Noether's theorem a consequence of a system's invariance under time-shifting. In this paper, we link Reynolds' relational parametricity with Noether's theorem for deriving conserved quantities. We propose an extension of System F$ \omega $ with new kinds, types and term constants for writing programs that describe classical mechanical systems in terms of their Lagrangians. 
We show, by constructing a relationally parametric model of our extension of F$ \omega $, that relational parametricity is enough to satisfy the hypotheses of Noether's theorem, and so to derive conserved quantities for free, directly from the polymorphic types of Lagrangians expressed in our system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Atkey:2014:RPM, author = "Robert Atkey and Neil Ghani and Patricia Johann", title = "A relationally parametric model of dependent type theory", journal = j-SIGPLAN, volume = "49", number = "1", pages = "503--515", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535852", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Reynolds' theory of relational parametricity captures the invariance of polymorphically typed programs under change of data representation. Reynolds' original work exploited the typing discipline of the polymorphically typed lambda-calculus System F, but there is now considerable interest in extending relational parametricity to type systems that are richer and more expressive than that of System F. This paper constructs parametric models of predicative and impredicative dependent type theory. The significance of our models is twofold. Firstly, in the impredicative variant we are able to deduce the existence of initial algebras for all indexed functors. To our knowledge, ours is the first account of parametricity for dependent types that is able to lift the useful deduction of the existence of initial algebras in parametric models of System F to the dependently typed setting. Secondly, our models offer conceptual clarity by uniformly expressing relational parametricity for dependent types in terms of reflexive graphs, which allows us to unify the interpretations of types and kinds, instead of taking the relational interpretation of types as a primitive notion. Expressing our model in terms of reflexive graphs ensures that it has canonical choices for the interpretations of the standard type constructors of dependent type theory, except for the interpretation of the universe of small types, where we formulate a refined interpretation tailored for relational parametricity. Moreover, our reflexive graph model opens the door to generalisations of relational parametricity, for example to higher-dimensional relational parametricity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Murawski:2014:GSI, author = "Andrzej S. Murawski and Nikos Tzevelekos", title = "Game semantics for interface middleweight {Java}", journal = j-SIGPLAN, volume = "49", number = "1", pages = "517--528", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535880", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We consider an object calculus in which open terms interact with the environment through interfaces.
The calculus is intended to capture the essence of contextual interactions of Middleweight Java code. Using game semantics, we provide fully abstract models for the induced notions of contextual approximation and equivalence. These are the first denotational models of this kind.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Jeannet:2014:AAG, author = "Bertrand Jeannet and Peter Schrammel and Sriram Sankaranarayanan", title = "Abstract acceleration of general linear loops", journal = j-SIGPLAN, volume = "49", number = "1", pages = "529--540", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535843", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We present abstract acceleration techniques for computing loop invariants for numerical programs with linear assignments and conditionals. Whereas abstract interpretation techniques typically over-approximate the set of reachable states iteratively, abstract acceleration captures the effect of the loop with a single, non-iterative transfer function applied to the initial states at the loop head. In contrast to previous acceleration techniques, our approach applies to any linear loop without restrictions. Its novelty lies in the use of the Jordan normal form decomposition of the loop body to derive symbolic expressions for the entries of the matrix modeling the effect of $ n \geq 0 $ iterations of the loop. The entries of such a matrix depend on $ n $ through complex polynomial, exponential and trigonometric functions. Therefore, we introduce an abstract domain for matrices that captures the linear inequality relations between these complex expressions. This results in an abstract matrix for describing the fixpoint semantics of the loop. Our approach integrates smoothly into standard abstract interpreters and can handle programs with nested loops and loops containing conditional branches. We evaluate it over small but complex loops that are commonly found in control software, comparing it with other tools for computing linear loop invariants. The loops in our benchmarks typically exhibit polynomial, exponential and oscillatory behaviors that present challenges to existing approaches. Our approach finds non-trivial invariants to prove useful bounds on the values of variables for such loops, clearly outperforming the existing approaches in terms of precision while exhibiting good performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{DAntoni:2014:MSA, author = "Loris D'Antoni and Margus Veanes", title = "Minimization of symbolic automata", journal = j-SIGPLAN, volume = "49", number = "1", pages = "541--553", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535849", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Symbolic Automata extend classical automata by using symbolic alphabets instead of finite ones.
Most of the classical automata algorithms rely on the alphabet being finite, and generalizing them to the symbolic setting is not a trivial task. In this paper we study the problem of minimizing symbolic automata. We formally define and prove the basic properties of minimality in the symbolic setting, and lift classical minimization algorithms (Huffman-Moore's and Hopcroft's algorithms) to symbolic automata. While Hopcroft's algorithm is the fastest known algorithm for DFA minimization, we show how, in the presence of symbolic alphabets, it can incur an exponential blowup. To address this issue, we introduce a new algorithm that fully benefits from the symbolic representation of the alphabet and does not suffer from the exponential blowup. We provide a comprehensive performance evaluation of all the algorithms over large benchmarks and against existing state-of-the-art implementations. The experiments show how the new symbolic algorithm is faster than previous implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chaudhuri:2014:CAD, author = "Swarat Chaudhuri and Azadeh Farzan and Zachary Kincaid", title = "Consistency analysis of decision-making programs", journal = j-SIGPLAN, volume = "49", number = "1", pages = "555--567", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535858", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Applications in many areas of computing make discrete decisions under uncertainty, for reasons such as limited numerical precision in calculations and errors in sensor-derived inputs. As a result, individual decisions made by such programs may be nondeterministic, and lead to contradictory decisions at different points of an execution. This means that an otherwise correct program may execute along paths that it would not follow under its ideal semantics, violating essential program invariants on the way. A program is said to be consistent if it does not suffer from this problem despite uncertainty in decisions. In this paper, we present a sound, automatic program analysis for verifying that a program is consistent in this sense. Our analysis proves that each decision made along a program execution is consistent with the decisions made earlier in the execution. The proof is done by generating an invariant that abstracts the set of all decisions made along executions that end at a program location l, then verifying, using a fixpoint constraint-solver, that no contradiction can be derived when these decisions are combined with new decisions made at l. We evaluate our analysis on a collection of programs implementing algorithms in computational geometry. Consistency is known to be a critical, frequently-violated, and thoroughly studied correctness property in geometry, but ours is the first attempt at automated verification of consistency of geometric algorithms. Our benchmark suite consists of implementations of convex hull computation, triangulation, and point location algorithms.
On almost all examples that are consistent (with two exceptions), our analysis is able to verify consistency within a few minutes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhang:2014:TGD, author = "Danfeng Zhang and Andrew C. Myers", title = "Toward general diagnosis of static errors", journal = j-SIGPLAN, volume = "49", number = "1", pages = "569--581", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535870", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We introduce a general way to locate programmer mistakes that are detected by static analyses such as type checking. The program analysis is expressed in a constraint language in which mistakes result in unsatisfiable constraints. Given an unsatisfiable system of constraints, both satisfiable and unsatisfiable constraints are analyzed, to identify the program expressions most likely to be the cause of unsatisfiability. The likelihood of different error explanations is evaluated under the assumption that the programmer's code is mostly correct, so the simplest explanations are chosen, following Bayesian principles. For analyses that rely on programmer-stated assumptions, the diagnosis also identifies assumptions likely to have been omitted. The new error diagnosis approach has been implemented for two very different program analyses: type inference in OCaml and information flow checking in Jif. The effectiveness of the approach is evaluated using previously collected programs containing errors. The results show that when compared to existing compilers and other tools, the general technique identifies the location of programmer errors significantly more accurately.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2014:CFT, author = "Sheng Chen and Martin Erwig", title = "Counter-factual typing for debugging type errors", journal = j-SIGPLAN, volume = "49", number = "1", pages = "583--594", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535863", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Changing a program in response to a type error plays an important part in modern software development. However, the generation of good type error messages remains a problem for highly expressive type systems. Existing approaches often suffer from a lack of precision in locating errors and proposing remedies. Specifically, they either fail to locate the source of the type error consistently, or they report too many potential error locations. Moreover, the change suggestions offered are often incorrect. This makes the debugging process tedious and ineffective. We present an approach to the problem of type debugging that is based on generating and filtering a comprehensive set of type-change suggestions. Specifically, we generate all (program-structure-preserving) type changes that can possibly fix the type error.
These suggestions will be ranked and presented to the programmer in an iterative fashion. In some cases we also produce suggestions to change the program. In most situations, this strategy delivers the correct change suggestions quickly, and at the same time never misses any rare suggestions. The computation of the potentially huge set of type-change suggestions is efficient since it is based on a variational type inference algorithm that type checks a program with variations only once, efficiently reusing type information for shared parts. We have evaluated our method and compared it with previous approaches. Based on a large set of examples drawn from the literature, we have found that our method outperforms other approaches and provides a viable alternative.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Boker:2014:BTS, author = "Udi Boker and Thomas A. Henzinger and Arjun Radhakrishna", title = "Battery transition systems", journal = j-SIGPLAN, volume = "49", number = "1", pages = "595--606", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535875", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "The analysis of the energy consumption of software is an important goal for quantitative formal methods. Current methods, using weighted transition systems or energy games, model the energy source as an ideal resource whose status is characterized by one number, namely the amount of remaining energy. Real batteries, however, exhibit behaviors that can deviate substantially from an ideal energy resource. Based on a discretization of a standard continuous battery model, we introduce {\em battery transition systems}. In this model, a battery is viewed as consisting of two parts --- the available-charge tank and the bound-charge tank. Any charge or discharge is applied to the available-charge tank. Over time, the energy from each tank diffuses to the other tank. Battery transition systems are infinite state systems that, being not well-structured, fall into no decidable class that is known to us. Nonetheless, we are able to prove that the $ \omega $-regular model-checking problem is decidable for battery transition systems. We also present a case study on the verification of control programs for energy-constrained semi-autonomous robots.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Li:2014:SOS, author = "Yi Li and Aws Albarghouthi and Zachary Kincaid and Arie Gurfinkel and Marsha Chechik", title = "Symbolic optimization with {SMT} solvers", journal = j-SIGPLAN, volume = "49", number = "1", pages = "607--618", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535857", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "The rise in efficiency of Satisfiability Modulo Theories (SMT) solvers has created numerous uses for them in software verification, program synthesis, functional programming, refinement types, etc. 
In all of these applications, SMT solvers are used for generating satisfying assignments (e.g., a witness for a bug) or proving unsatisfiability/validity (e.g., proving that a subtyping relation holds). We are often interested in finding not just an arbitrary satisfying assignment, but one that optimizes (minimizes/maximizes) certain criteria. For example, we might be interested in detecting program executions that maximize energy usage (performance bugs), or synthesizing short programs that do not make expensive API calls. Unfortunately, none of the available SMT solvers offer such optimization capabilities. In this paper, we present SYMBA, an efficient SMT-based optimization algorithm for objective functions in the theory of linear real arithmetic (LRA). Given a formula $ \phi $ and an objective function $ t $, SYMBA finds a satisfying assignment of $ \phi $ that maximizes the value of $ t $. SYMBA utilizes efficient SMT solvers as black boxes. As a result, it is easy to implement and it directly benefits from future advances in SMT solvers. Moreover, SYMBA can optimize a set of objective functions, reusing information between them to speed up the analysis. We have implemented SYMBA and evaluated it on a large number of optimization benchmarks drawn from program analysis tasks. Our results indicate the power and efficiency of SYMBA in comparison with competing approaches, and highlight the importance of its multi-objective-function feature.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Benton:2014:AEP, author = "Nick Benton and Martin Hofmann and Vivek Nigam", title = "Abstract effects and proof-relevant logical relations", journal = j-SIGPLAN, volume = "49", number = "1", pages = "619--631", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535869", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We give a denotational semantics for a region-based effect system that supports type abstraction in the sense that only externally visible effects need to be tracked: non-observable internal modifications, such as the reorganisation of a search tree or lazy initialisation, can count as 'pure' or 'read only'. This 'fictional purity' allows clients of a module to validate soundly more effect-based program equivalences than would be possible with previous semantics. Our semantics uses a novel variant of logical relations that maps types not merely to partial equivalence relations on values, as is commonly done, but rather to a proof-relevant generalisation thereof, namely setoids. The objects of a setoid establish that values inhabit semantic types, whilst its morphisms are understood as proofs of semantic equivalence.
The transition to proof-relevance solves two awkward problems caused by na{\"\i}ve use of existential quantification in Kripke logical relations, namely failure of admissibility and spurious functional dependencies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Katsumata:2014:PEM, author = "Shin-ya Katsumata", title = "Parametric effect monads and semantics of effect systems", journal = j-SIGPLAN, volume = "49", number = "1", pages = "633--645", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535846", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "We study fundamental properties of a generalisation of monad called parametric effect monad, and apply it to the interpretation of general effect systems whose effects have sequential composition operators. We show that parametric effect monads admit analogues of the structures and concepts that exist for monads, such as Kleisli triples, the state monad and the continuation monad, Plotkin and Power's algebraic operations, and the categorical $ \top \top $-lifting. We also show a systematic method to generate both effects and a parametric effect monad from a monad morphism. Finally, we introduce two effect systems with explicit and implicit subeffecting, and discuss their denotational semantics and the soundness of effect systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Pagani:2014:AQS, author = "Michele Pagani and Peter Selinger and Beno{\^\i}t Valiron", title = "Applying quantitative semantics to higher-order quantum computing", journal = j-SIGPLAN, volume = "49", number = "1", pages = "647--658", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535879", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Finding a denotational semantics for higher order quantum computation is a long-standing problem in the semantics of quantum programming languages. Most past approaches to this problem fell short in one way or another, either limiting the language to an unusably small finitary fragment, or giving up important features of quantum physics such as entanglement.
In this paper, we propose a denotational semantics for a quantum lambda calculus with recursion and an infinite data type, using constructions from quantitative semantics of linear logic.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Accattoli:2014:NST, author = "Beniamino Accattoli and Eduardo Bonelli and Delia Kesner and Carlos Lombardi", title = "A nonstandard standardization theorem", journal = j-SIGPLAN, volume = "49", number = "1", pages = "659--670", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535886", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Standardization is a fundamental notion for connecting programming languages and rewriting calculi. Since both programming languages and calculi rely on substitution for defining their dynamics, explicit substitutions (ES) help further close the gap between theory and practice. This paper focuses on standardization for the linear substitution calculus, a calculus with ES capable of mimicking reduction in lambda-calculus and linear logic proof-nets. For the latter, proof-nets can be formalized by means of a simple equational theory over the linear substitution calculus. Contrary to other extant calculi with ES, our system can be equipped with a residual theory in the sense of L{\'e}vy, which is used to prove a left-to-right standardization theorem for the calculus with ES but without the equational theory. Such a theorem, however, does not lift from the calculus with ES to proof-nets, because the notion of left-to-right derivation is not preserved by the equational theory. We then relax the notion of left-to-right standard derivation, based on a total order on redexes, to a more liberal notion of standard derivation based on partial orders. Our proofs rely on Gonthier, L{\'e}vy, and Melli{\`e}s' axiomatic theory for standardization. However, we go beyond merely applying their framework, revisiting some of its key concepts: we obtain uniqueness (modulo) of standard derivations in an abstract way and we provide a coinductive characterization of their key abstract notion of external redex. This last point is then used to give a simple proof that linear head reduction --a nondeterministic strategy having a central role in the theory of linear logic-- is standard.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Eisenberg:2014:CTF, author = "Richard A. Eisenberg and Dimitrios Vytiniotis and Simon Peyton Jones and Stephanie Weirich", title = "Closed type families with overlapping equations", journal = j-SIGPLAN, volume = "49", number = "1", pages = "671--683", month = jan, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578855.2535856", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Mar 4 17:04:57 MST 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "POPL '14 conference proceedings.", abstract = "Open, type-level functions are a recent innovation in Haskell that move Haskell towards the expressiveness of dependent types, while retaining the look and feel of a practical programming language. 
This paper shows how to increase expressiveness still further, by adding closed type functions whose equations may overlap, and may have non-linear patterns over an open type universe. Although practically useful and simple to implement, these features go beyond conventional dependent type theory in some respects, and have a subtle metatheory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lerner:2014:TRT, author = "Benjamin S. Lerner and Joe Gibbs Politz and Arjun Guha and Shriram Krishnamurthi", title = "{TeJaS}: retrofitting type systems for {JavaScript}", journal = j-SIGPLAN, volume = "49", number = "2", pages = "1--16", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508170", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "JavaScript programs vary widely in functionality, complexity, and use, and analyses of these programs must accommodate such variations. Type-based analyses are typically the simplest such analyses, but due to the language's subtle idioms and many application-specific needs --- such as ensuring general-purpose type correctness, security properties, or proper library usage --- we have found that a single type system does not suffice for all purposes. However, these varied uses still share many reusable common elements. In this paper we present TeJaS, a framework for building type systems for JavaScript. TeJaS has been engineered modularly to encourage experimentation. Its initial type environment is reified, to admit easy modeling of the various execution contexts of JavaScript programs, and its type language and typing rules are extensible, to enable variations of the type system to be constructed easily. The paper presents the base TeJaS type system, which performs traditional type-checking for JavaScript. Because JavaScript demands complex types, we explain several design decisions to improve user ergonomics. We then describe TeJaS's modular structure, and illustrate it by reconstructing the essence of a very different type system for JavaScript. Systems built from TeJaS have been applied to several real-world, third-party JavaScript programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Kashyap:2014:TRS, author = "Vineeth Kashyap and John Sarracino and John Wagner and Ben Wiedermann and Ben Hardekopf", title = "Type refinement for static analysis of {JavaScript}", journal = j-SIGPLAN, volume = "49", number = "2", pages = "17--26", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508175", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "Static analysis of JavaScript has proven useful for a variety of purposes, including optimization, error checking, security auditing, program refactoring, and more. 
We propose a technique called type refinement that can improve the precision of such static analyses for JavaScript without any discernible performance impact. Refinement is a known technique that uses the conditions in branch guards to refine the analysis information propagated along each branch path. The key insight of this paper is to recognize that JavaScript semantics include many implicit conditional checks on types, and that performing type refinement on these implicit checks provides significant benefit for analysis precision. To demonstrate the effectiveness of type refinement, we implement a static analysis tool for reporting potential type-errors in JavaScript programs. We provide an extensive empirical evaluation of type refinement using a benchmark suite containing a variety of JavaScript application domains, ranging from the standard performance benchmark suites (Sunspider and Octane), to open-source JavaScript applications, to machine-generated JavaScript via Emscripten. We show that type refinement can significantly improve analysis precision by up to 86\% without affecting the performance of the analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Allende:2014:CIS, author = "Esteban Allende and Johan Fabry and {\'E}ric Tanter", title = "Cast insertion strategies for gradually-typed objects", journal = j-SIGPLAN, volume = "49", number = "2", pages = "27--36", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508171", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "Gradual typing enables a smooth and progressive integration of static and dynamic typing. The semantics of a gradually-typed program is given by translation to an intermediate language with casts: runtime type checks that control the boundaries between statically- and dynamically-typed portions of a program. This paper studies the performance of different cast insertion strategies in the context of Gradualtalk, a gradually-typed Smalltalk. We first implement the strategy specified by Siek and Taha, which inserts casts at call sites. We then study the dual approach, which consists in performing casts in callees. Based on the observation that both strategies perform well in different scenarios, we design a hybrid strategy that combines the best of each approach. We evaluate these three strategies using both micro- and macro-benchmarks. We also discuss the impact of these strategies on memory, modularity, and inheritance. The hybrid strategy constitutes a promising cast insertion strategy for adding gradual types to existing dynamically-typed languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Kedlaya:2014:ITS, author = "Madhukar N. 
Kedlaya and Jared Roesch and Behnam Robatmili and Mehrdad Reshadi and Ben Hardekopf", title = "Improved type specialization for dynamic scripting languages", journal = j-SIGPLAN, volume = "49", number = "2", pages = "37--48", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508177", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "Type feedback and type inference are two common methods used to optimize dynamic languages such as JavaScript. Each of these methods has its own strengths and weaknesses, and we propose that each can benefit from the other if combined in the right way. We explore the interdependency between these two methods and propose two novel ways to combine them in order to significantly increase their aggregate benefit and decrease their aggregate overhead. In our proposed strategy, an initial type inference pass is applied that can reduce type feedback overhead by enabling more intelligent placement of profiling hooks. This initial type inference pass is novel in the literature. After profiling, a final type inference pass uses the type information from profiling to generate efficient code. While this second pass is not novel, we significantly improve its effectiveness in a novel way by feeding the type inference pass information about the function signature, i.e., the types of the function's arguments for a specific function invocation. Our results show significant speedups when using these low-overhead strategies, ranging from $ 1.2 \times $ to $ 4 \times $ over an implementation that does not perform type feedback or type inference based optimizations. Our experiments are carried out across a wide range of traditional benchmarks and realistic web applications. The results also show an average reduction of 23.5\% in the size of the profiled data for these benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Keil:2014:EDA, author = "Matthias Keil and Peter Thiemann", title = "Efficient dynamic access analysis using {JavaScript} proxies", journal = j-SIGPLAN, volume = "49", number = "2", pages = "49--60", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508176", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "JSConTest introduced the notions of effect monitoring and dynamic effect inference for JavaScript. It enables the description of effects with path specifications resembling regular expressions. It is implemented by an offline source code transformation. To overcome the limitations of the JSConTest implementation, we redesigned and reimplemented effect monitoring by taking advantage of JavaScript proxies. Our new design avoids all drawbacks of the prior implementation. 
It guarantees full interposition; it is not restricted to a subset of JavaScript; it is self-maintaining; and its scalability to large programs is significantly better than with JSConTest. The improved scalability has two sources. First, the reimplementation is significantly faster than the original, transformation-based implementation. Second, the reimplementation relies on the fly-weight pattern and on trace reduction to conserve memory. Only the combination of these techniques enables monitoring and inference for large programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Weiher:2014:PIU, author = "Marcel Weiher and Robert Hirschfeld", title = "Polymorphic identifiers: uniform resource access in {Objective-Smalltalk}", journal = j-SIGPLAN, volume = "49", number = "2", pages = "61--72", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508169", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "In object-oriented programming, polymorphic dispatch of operations decouples clients from specific providers of services and allows implementations to be modified or substituted without affecting clients. The Uniform Access Principle (UAP) tries to extend these qualities to resource access by demanding that access to state be indistinguishable from access to operations. Despite language features supporting the UAP, the overall goal of substitutability has not been achieved for either alternative resources such as keyed storage, files or web pages, or for alternate access mechanisms: specific kinds of resources are bound to specific access mechanisms and vice versa. Changing storage or access patterns either requires changes to both clients and service providers and trying to maintain the UAP imposes significant penalties in terms of code-duplication and/or performance overhead. We propose introducing first class identifiers as polymorphic names for storage locations to solve these problems. 
With these Polymorphic Identifiers, we show that we can provide uniform access to a wide variety of resource types as well as storage and access mechanisms, whether parametrized or direct, without affecting client code, without causing code duplication or significant performance penalties.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Park:2014:AAS, author = "Changhee Park and Hongki Lee and Sukyoung Ryu", title = "All about the with statement in {JavaScript}: removing with statements in {JavaScript} applications", journal = j-SIGPLAN, volume = "49", number = "2", pages = "73--84", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508173", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "The with statement in JavaScript makes static analysis of JavaScript applications difficult by introducing a new scope at run time and thus invalidating lexical scoping. Therefore, many static approaches to JavaScript program analysis and the strict mode of ECMAScript 5 simply disallow the with statement. To justify exclusion of the with statement, we should better understand the actual usage patterns of the with statement. In this paper, we present the usage patterns of the with statement in real-world JavaScript applications currently used in the 898 most popular web sites. We investigate whether we can rewrite the with statements in each pattern to other statements not using the with statement. We show that we can rewrite all the static occurrences of the with statement that do not have any dynamic code generating functions. Even though the rewriting process is not applicable to any dynamically generated with statements, our results are still promising. Because all the static approaches that disallow the with statement also disallow dynamic code generation, such static approaches can allow the with statement using our rewriting process. We formally present our rewriting strategy, provide its implementation, and show its faithfulness using extensive testing. We believe that removing with statements will simplify JavaScript program analysis designs without considering dynamic scope introduction while imposing fewer syntactic restrictions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Lameed:2014:OMF, author = "Nurudeen A. Lameed and Laurie J. 
Hendren", title = "Optimizing {MATLAB} {\tt feval} with dynamic techniques", journal = j-SIGPLAN, volume = "49", number = "2", pages = "85--96", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508174", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/matlab.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "MATLAB is a popular dynamic array-based language used by engineers, scientists and students worldwide. The built-in function feval is an important MATLAB feature for certain classes of numerical programs and solvers which benefit from having functions as parameters. Programmers may pass a function name or function handle to the solver and then the solver uses feval to indirectly call the function. In this paper, we show that there are significant performance overheads for function calls via feval, in both MATLAB interpreters and JITs. The paper then proposes, implements and compares two on-the-fly mechanisms for specialization of feval calls. The first approach uses on-stack replacement technology, as supported by McVM/McOSR. The second approach specializes calls of functions with feval using a combination of runtime input argument types and values. Experimental results on seven numerical solvers show that the techniques provide good performance improvements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Yoo:2014:WRR, author = "Danny Yoo and Shriram Krishnamurthi", title = "{Whalesong}: running {Racket} in the browser", journal = j-SIGPLAN, volume = "49", number = "2", pages = "97--108", month = feb, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2578856.2508172", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 06:09:05 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "DLS '13 conference proceedings.", abstract = "JavaScript is the language of the ubiquitous Web, but it only poorly supports event-driven functional programs due to its single-threaded, asynchronous nature and lack of rich control flow operators. We present Whalesong, a compiler from Racket that generates JavaScript code that masks these problems. We discuss the implementation strategy using delimited continuations, an interface to the DOM, and an FFI for adapting JavaScript libraries to add new platform-dependent reactive features. In the process, we also describe extensions to Racket's functional event-driven programming model. 
We also briefly discuss the implementation details.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '13 conference proceedings.", } @Article{Bodik:2014:MBS, author = "Rastislav Bodik", title = "Modeling biology with solver-aided programming languages", journal = j-SIGPLAN, volume = "49", number = "3", pages = "1--2", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517229", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A good model of a biological cell exposes secrets of the cell's signaling mechanisms, explaining diseases and facilitating drug discovery. Modeling cells is fundamentally a programming problem --- it's programming because the model is a concurrent program that simulates the cell, and it's a problem because it is hard to write a program that reproduces all experimental observations of the cell faithfully. In this talk, I will introduce solver-aided programming languages and show how they ease modeling biology as well as make programming accessible to non-programmers. Solver-aided languages come with constructs that delegate part of the programming problem to a constraint solver, which can be guided to synthesize parts of the program, localize its bugs, or act as a clairvoyant oracle. I will describe our work on synthesis of stem cell models in {\em C. elegans} and then show how our framework called Rosette can rapidly implement a solver-aided language in several domains, from programming by demonstration to spatial parallel programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Erdweg:2014:FEL, author = "Sebastian Erdweg and Felix Rieger", title = "A framework for extensible languages", journal = j-SIGPLAN, volume = "49", number = "3", pages = "3--12", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517210", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Extensible programming languages such as SugarJ or Racket enable programmers to introduce customary language features as extensions of the base language. Traditionally, systems that support language extensions are either (i) agnostic to the base language or (ii) only support a single base language. In this paper, we present a framework for language extensibility that turns a non-extensible language into an extensible language featuring library-based extensible syntax, extensible static analyses, and extensible editor support. To make a language extensible, our framework only requires knowledge of the base language's grammar, the syntax for import statements (which activate extensions), and how to compile base-language programs.
We have evaluated the generality of our framework by instantiating it for Java, Haskell, Prolog, JavaScript, and System F$_{ \omega }$, and by studying existing module-system features and their support in our framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Flatt:2014:SRY, author = "Matthew Flatt", title = "Submodules in {Racket}: you want it when, again?", journal = j-SIGPLAN, volume = "49", number = "3", pages = "13--22", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517211", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In an extensible programming language, programmers write code that must run at different times --- in particular, at compile time versus run time. The module system of the Racket programming language enables a programmer to reason about programs in the face of such extensibility, because the distinction between run-time and compile-time phases is built into the language model. Submodules extend Racket's module system to make the phase-separation facet of the language extensible. That is, submodules give programmers the capability to define new phases, such as `test time' or `documentation time,' with the same reasoning and code-management benefits as the built-in distinction between run time and compile time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Dyer:2014:DVE, author = "Robert Dyer and Hridesh Rajan and Tien N. Nguyen", title = "Declarative visitors to ease fine-grained source code mining with full history on billions of {AST} nodes", journal = j-SIGPLAN, volume = "49", number = "3", pages = "23--32", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517226", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software repositories contain a vast wealth of information about software development. Mining these repositories has proven useful for detecting patterns in software development, testing hypotheses for new software engineering approaches, etc. Specifically, mining source code has yielded significant insights into software development artifacts and processes. Unfortunately, mining source code at a large-scale remains a difficult task. Previous approaches had to either limit the scope of the projects studied, limit the scope of the mining task to be more coarse-grained, or sacrifice studying the history of the code due to both human and computational scalability issues. In this paper we address the substantial challenges of mining source code: (a) at a very large scale; (b) at a fine-grained level of detail; and (c) with full history information. To address these challenges, we present domain-specific language features for source code mining.
Our language features are inspired by object-oriented visitors and provide a default depth-first traversal strategy along with two expressions for defining custom traversals. We provide an implementation of these features in the Boa infrastructure for software repository mining and describe a code generation strategy into Java code. To show the usability of our domain-specific language features, we reproduced over 40 source code mining tasks from two large-scale previous studies in just 2 person-weeks. The resulting code for these tasks show between $ 2.0 \times $--$ 4.8 \times $ reduction in code size. Finally we perform a small controlled experiment to gain insights into how easily mining tasks written using our language features can be understood, with no prior training. We show a substantial number of tasks (77\%) were understood by study participants, in about 3 minutes per task.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Solodkyy:2014:OPM, author = "Yuriy Solodkyy and Gabriel {Dos Reis} and Bjarne Stroustrup", title = "Open pattern matching for {C++}", journal = j-SIGPLAN, volume = "49", number = "3", pages = "33--42", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517222", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Pattern matching is an abstraction mechanism that can greatly simplify source code. We present functional-style pattern matching for C++ implemented as a library, called Mach7$^1$. All the patterns are user-definable, can be stored in variables, passed among functions, and allow the use of class hierarchies. As an example, we implement common patterns used in functional languages. Our approach to pattern matching is based on compile-time composition of pattern objects through concepts. This is superior (in terms of performance and expressiveness) to approaches based on run-time composition of polymorphic pattern objects. In particular, our solution allows mapping functional code based on pattern matching directly into C++ and produces code that is only a few percent slower than hand-optimized C++ code. The library uses an efficient type switch construct, further extending it to multiple scrutinees and general patterns. We compare the performance of pattern matching to that of double dispatch and open multi-methods in C++.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Martin:2014:TCR, author = "Marko Martin and Mira Mezini and Sebastian Erdweg", title = "Template constructors for reusable object initialization", journal = j-SIGPLAN, volume = "49", number = "3", pages = "43--52", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517212", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reuse of and abstraction over object initialization logic is not properly supported in mainstream object-oriented languages. 
This may result in significant amount of boilerplate code and proliferation of constructors in subclasses. It also makes it impossible for mixins to extend the initialization interface of classes they are applied to. We propose template constructors, which employ template parameters and pattern matching of them against signatures of superclass constructors to enable a one-to-many binding of super-calls. We demonstrate how template constructors solve the aforementioned problems. We present a formalization of the concept, a Java-based implementation, and use cases which exercise its strengths.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Richard-Foy:2014:EHL, author = "Julien Richard-Foy and Olivier Barais and Jean-Marc J{\'e}z{\'e}quel", title = "Efficient high-level abstractions for {Web} programming", journal = j-SIGPLAN, volume = "49", number = "3", pages = "53--60", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517227", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Writing large Web applications is known to be difficult. One challenge comes from the fact that the application's logic is scattered into heterogeneous clients and servers, making it difficult to share code between both sides or to move code from one side to the other. Another challenge is performance: while Web applications rely on ever more code on the client-side, they may run on smart phones with limited hardware capabilities. These two challenges raise the following problem: how to benefit from high-level languages and libraries making code complexity easier to manage and abstracting over the clients and servers differences without trading this ease of engineering for performance? This article presents high-level abstractions defined as deep embedded DSLs in Scala that can generate efficient code leveraging the characteristics of both client and server environments. We compare performance on client-side against other candidate technologies and against hand written low-level JavaScript code. Though code written with our DSL has a high level of abstraction, our benchmark on a real world application reports that it runs as fast as hand tuned low-level JavaScript code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Gerakios:2014:RTP, author = "Prodromos Gerakios and Aggelos Biboudis and Yannis Smaragdakis", title = "Reified type parameters using {Java} annotations", journal = j-SIGPLAN, volume = "49", number = "3", pages = "61--64", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517223", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Java generics are compiled by-erasure: all clients reuse the same bytecode, with uses of the unknown type erased. 
C++ templates are compiled by-expansion: each type-instantiation of a template produces a different code definition. The two approaches offer trade-offs on multiple axes. We propose an extension of Java generics that allows by-expansion translation relative to selected type parameters only. This language design allows sophisticated users to get the best of both worlds at a fine granularity. Furthermore, our proposal is based on Java 8 Type Annotations (JSR 308) and the Checker Framework as an abstraction layer for controlling compilation without changes to the internals of a Java compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Schulze:2014:DDP, author = "Sandro Schulze and J{\"o}rg Liebig and Janet Siegmund and Sven Apel", title = "Does the discipline of preprocessor annotations matter?: a controlled experiment", journal = j-SIGPLAN, volume = "49", number = "3", pages = "65--74", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517215", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The C preprocessor ( CPP ) is a simple and language-independent tool, widely used to implement variable software systems using conditional compilation (i.e., by including or excluding annotated code). Although CPP provides powerful means to express variability, it has been criticized for allowing arbitrary annotations that break the underlying structure of the source code. We distinguish between disciplined annotations, which align with the structure of the source code, and undisciplined annotations, which do not. Several studies suggest that especially the latter type of annotations makes it hard to (automatically) analyze the code. However, little is known about whether the type of annotations has an effect on program comprehension. We address this issue by means of a controlled experiment with human subjects. We designed similar tasks for both, disciplined and undisciplined annotations, to measure program comprehension. Then, we measured the performance of the subjects regarding correctness and response time for solving the tasks. Our results suggest that there are no differences between disciplined and undisciplined annotations from a program-comprehension perspective. Nevertheless, we observed that finding and correcting errors is a time-consuming and tedious task in the presence of preprocessor annotations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Medeiros:2014:IPB, author = "Fl{\'a}vio Medeiros and M{\'a}rcio Ribeiro and Rohit Gheyi", title = "Investigating preprocessor-based syntax errors", journal = j-SIGPLAN, volume = "49", number = "3", pages = "75--84", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517221", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The C preprocessor is commonly used to implement variability in program families. 
Despite the widespread usage, some studies indicate that the C preprocessor makes variability implementation difficult and error-prone. However, we still lack studies to investigate preprocessor-based syntax errors and quantify to what extent they occur in practice. In this paper, we define a technique based on a variability-aware parser to find syntax errors in releases and commits of program families. To investigate these errors, we perform an empirical study where we use our technique in 41 program family releases, and more than 51 thousand commits of 8 program families. We find 7 and 20 syntax errors in releases and commits of program families, respectively. They are related not only to incomplete annotations, but also to complete ones. We submit 8 patches to fix errors that developers have not fixed yet, and they accept 75\% of them. Our results reveal that the time developers need to fix the errors varies from days to years in family repositories. We detect errors even in releases of well-known and widely used program families, such as Bash, CVS and Vim. We also classify the syntax errors into 6 different categories. This classification may guide developers to avoid them during development.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Kramer:2014:UDO, author = "Dean Kramer and Samia Oussena and Peter Komisarczuk and Tony Clark", title = "Using document-oriented {GUIs} in dynamic software product lines", journal = j-SIGPLAN, volume = "49", number = "3", pages = "85--94", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517214", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic Software Product Line (DSPL) Engineering has gained interest through its promise of being able to unify software adaptation whereby software adaptation can be realised at compile time and runtime. While previous work has enabled program logic adaptation by the use of language extensions and platform support, little attention has been placed on Graphical User Interface (GUI) variability. Different design patterns including the Model View Controller are commonly used in GUI implementation, with GUI documents being used for declaring the GUI. To handle dynamic GUI variability currently, the developer needs to implement GUI refinements using multiple techniques. This paper proposes a solution for dealing with GUI document variability, statically and dynamically, in a unified way. In our approach, we currently use a compile time method for producing GUI variants, and code transformations to handle these variants within the application at runtime. To avoid GUI duplicates, only GUI variants that are unique, and related to a valid product configuration, are produced. 
To validate our approach, we implemented tool support to enable this for Android-based applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Siegmund:2014:FBP, author = "Norbert Siegmund and Alexander von Rhein and Sven Apel", title = "Family-based performance measurement", journal = j-SIGPLAN, volume = "49", number = "3", pages = "95--104", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517209", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most contemporary programs are customizable. They provide many features that give rise to millions of program variants. Determining which feature selection yields an optimal performance is challenging, because of the exponential number of variants. Predicting the performance of a variant based on previous measurements proved successful, but induces a trade-off between the measurement effort and prediction accuracy. We propose the alternative approach of family-based performance measurement, to reduce the number of measurements required for identifying feature interactions and for obtaining accurate predictions. The key idea is to create a variant simulator (by translating compile-time variability to run-time variability) that can simulate the behavior of all program variants. We use it to measure performance of individual methods, trace methods to features, and infer feature interactions based on the call graph. We evaluate our approach by means of five feature-oriented programs. On average, we achieve an accuracy of 98\%, with only a single measurement per customizable program. Observations show that our approach opens avenues of future research in different domains, such as feature-interaction detection and testing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Marek:2014:SRC, author = "Luk{\'a}s Marek and Stephen Kell and Yudi Zheng and Lubom{\'\i}r Bulej and Walter Binder and Petr Tuma and Danilo Ansaloni and Aibek Sarimbekov and Andreas Sewe", title = "{ShadowVM}: robust and comprehensive dynamic program analysis for the {Java} platform", journal = j-SIGPLAN, volume = "49", number = "3", pages = "105--114", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517219", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic analysis tools are often implemented using instrumentation, particularly on managed runtimes including the Java Virtual Machine (JVM). Performing instrumentation robustly is especially complex on such runtimes: existing frameworks offer limited coverage and poor isolation, while previous work has shown that apparently innocuous instrumentation can cause deadlocks or crashes in the observed application. This paper describes ShadowVM, a system for instrumentation-based dynamic analyses on the JVM which combines a number of techniques to greatly improve both isolation and coverage.
These centre on the offload of analysis to a separate process; we believe our design is the first system to enable genuinely full bytecode coverage on the JVM. We describe a working implementation, and use a case study to demonstrate its improved coverage and to evaluate its runtime overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Kolesnikov:2014:CPB, author = "Sergiy Kolesnikov and Alexander von Rhein and Claus Hunsen and Sven Apel", title = "A comparison of product-based, feature-based, and family-based type checking", journal = j-SIGPLAN, volume = "49", number = "3", pages = "115--124", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517213", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Analyzing software product lines is difficult, due to their inherent variability. In the past, several strategies for product-line analysis have been proposed, in particular, product-based, feature-based, and family-based strategies. Despite recent attempts to conceptually and empirically compare different strategies, there is no work that empirically compares all of the three strategies in a controlled setting. We close this gap by extending a compiler for feature-oriented programming with support for product-based, feature-based, and family-based type checking. We present and discuss the results of a comparative performance evaluation that we conducted on a set of 12 feature-oriented, Java-based product lines. Most notably, we found that the family-based strategy is superior for all subject product lines: it is substantially faster, it detects all kinds of errors, and provides the most detailed information about them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Ofenbeck:2014:SST, author = "Georg Ofenbeck and Tiark Rompf and Alen Stojanov and Martin Odersky and Markus P{\"u}schel", title = "{Spiral} in {Scala}: towards the systematic construction of generators for performance libraries", journal = j-SIGPLAN, volume = "49", number = "3", pages = "125--134", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517228", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Program generators for high performance libraries are an appealing solution to the recurring problem of porting and optimizing code with every new processor generation, but only few such generators exist to date. This is due to not only the difficulty of the design, but also of the actual implementation, which often results in an ad-hoc collection of standalone programs and scripts that are hard to extend, maintain, or reuse. In this paper we ask whether and which programming language concepts and features are needed to enable a more systematic construction of such generators. 
The systematic approach we advocate extrapolates from existing generators: (a) describing the problem and algorithmic knowledge using one, or several, domain-specific languages (DSLs), (b) expressing optimizations and choices as rewrite rules on DSL programs, (c) designing data structures that can be configured to control the type of code that is generated and the data representation used, and (d) using autotuning to select the best-performing alternative. As a case study, we implement a small, but representative subset of Spiral in Scala using the Lightweight Modular Staging (LMS) framework. The first main contribution of this paper is the realization of (c) using type classes to abstract over staging decisions, i.e. which pieces of a computation are performed immediately and for which pieces code is generated. Specifically, we abstract over different complex data representations jointly with different code representations including generating loops versus unrolled code with scalar replacement --- a crucial and usually tedious performance transformation. The second main contribution is to provide full support for (a) and (d) within the LMS framework: we extend LMS to support translation between different DSLs and autotuning through search.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Chapin:2014:SNT, author = "Peter Chapin and Christian Skalka and Scott Smith and Michael Watson", title = "{Scalaness\slash nesT}: type specialized staged programming for sensor networks", journal = j-SIGPLAN, volume = "49", number = "3", pages = "135--144", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517217", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming wireless embedded networks is challenging due to severe limitations on processing speed, memory, and bandwidth. Staged programming can help bridge the gap between high level code refinement techniques and efficient device level programs by allowing a first stage program to specialize device level code. Here we introduce a two stage programming system for wireless sensor networks. The first stage program is written in our extended dialect of Scala, called Scalaness, where components written in our type safe dialect of nesC, called nesT, are composed and specialized. Scalaness programs can dynamically construct TinyOS-compliant nesT device images that can be deployed to motes. A key result, called cross-stage type safety, shows that successful static type checking of a Scalaness program means no type errors will arise either during programmatic composition and specialization of WSN code, or later on the WSN itself. Scalaness has been implemented through direct modification of the Scala compiler. Implementation of a staged public-key cryptography calculation shows the sensor memory footprint can be significantly reduced by staging.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Sujeeth:2014:FGH, author = "Arvind K. Sujeeth and Austin Gibbons and Kevin J. 
Brown and HyoukJoong Lee and Tiark Rompf and Martin Odersky and Kunle Olukotun", title = "Forge: generating a high performance {DSL} implementation from a declarative specification", journal = j-SIGPLAN, volume = "49", number = "3", pages = "145--154", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517220", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Domain-specific languages provide a promising path to automatically compile high-level code to parallel, heterogeneous, and distributed hardware. However, in practice high performance DSLs still require considerable software expertise to develop and force users into tool-chains that hinder prototyping and debugging. To address these problems, we present Forge, a new meta DSL for declaratively specifying high performance embedded DSLs. Forge provides DSL authors with high-level abstractions (e.g., data structures, parallel patterns, effects) for specifying their DSL in a way that permits high performance. From this high-level specification, Forge automatically generates both a na{\"\i}ve Scala library implementation of the DSL and a high performance version using the Delite DSL framework. Users of a Forge-generated DSL can prototype their application using the library version, and then switch to the Delite version to run on multicore CPUs, GPUs, and clusters without changing the application code. Forge-generated Delite DSLs perform within 2x of hand-optimized C++ and up to $ 40 \times $ better than Spark, an alternative high-level distributed programming environment. Compared to a manually implemented Delite DSL, Forge provides a factor of $3$--$ 6 \times $ reduction in lines of code and does not sacrifice any performance. Furthermore, Forge specifications can be generated from existing Scala libraries, are easy to maintain, shield DSL developers from changes in the Delite framework, and enable DSLs to be retargeted to other frameworks transparently.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Kurilova:2014:SSL, author = "Darya Kurilova and Derek Rayside", title = "On the simplicity of synthesizing linked data structure operations", journal = j-SIGPLAN, volume = "49", number = "3", pages = "155--158", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517225", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We argue that synthesizing operations on recursive linked data structures is not as hard as it appears and is, in fact, within reach of current SAT-based synthesis techniques --- with the addition of a simple approach that we describe to decompose the problem into smaller parts. To generate smaller pieces of code, i.e., shorter routines, is obviously easier than large and complex routines, and, also, there is more potential for automating the code synthesis. 
In this paper, we present a code generation algorithm for synthesizing operations of linked data structures and, as an example, describe how the proposed algorithm works to synthesize operations of an AVL tree.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Dhungana:2014:GCD, author = "Deepak Dhungana and Andreas Falkner and Alois Haselb{\"o}ck", title = "Generation of conjoint domain models for system-of-systems", journal = j-SIGPLAN, volume = "49", number = "3", pages = "159--168", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517224", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software solutions in complex environments, such as railway control systems or power plants, are assemblies of heterogeneous components, which are very large and complex systems themselves. Interplay of these systems requires a thorough design of a system-of-systems (SoS) encompassing the required interactions between the involved systems. One of the challenges lies in reconciliation of the domain data structures and runtime constraints to ensure consistency of the SoS behavior. In this paper, we present a generative approach that enables reconciliation of a common platform based on reusable domain models of the involved systems. This is comparable to a product line configuration problem where we generate a common platform model for all involved systems. We discuss the specific requirements for model composition in a SoS context and address them in our approach. In particular, our approach addresses the operational and managerial independence of the individual systems and offers appropriate modeling constructs. We report on our experiences of applying the approach in several real world projects and share the lessons learned.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Basso:2014:SLS, author = "F{\'a}bio Paulo Basso and Raquel Mainardi Pillat and Toacy Cavalcante Oliveira and Leandro Buss Becker", title = "Supporting large scale model transformation reuse", journal = j-SIGPLAN, volume = "49", number = "3", pages = "169--178", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517218", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The growth of applications developed with the support of model transformations makes reuse a required practice, especially when applied to transformation assets (e.g. transformation chains, algorithms, and configuration files). In order to promote reuse one must consider the different implementations, commonalities, and variants among these assets. In this domain, a couple of techniques have been used as solutions to adapt reusable assets for specific needs. However, so far, no work has discussed their combined use in real software projects. In this paper, we present a new tool named WCT, which can be used to adapt transformation assets.
Moreover, through lessons learned in industry, we address some reuse techniques devoted to adapting these assets.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{An:2014:MDG, author = "Kyoungho An and Takayuki Kuroda and Aniruddha Gokhale and Sumant Tambe and Andrea Sorbini", title = "Model-driven generative framework for automated {OMG DDS} performance testing in the cloud", journal = j-SIGPLAN, volume = "49", number = "3", pages = "179--182", month = mar, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2637365.2517216", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 26 05:58:25 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Object Management Group's (OMG) Data Distribution Service (DDS) provides many configurable policies which determine end-to-end quality of service (QoS) of applications. It is challenging to predict the system's performance in terms of latencies, throughput, and resource usage because diverse combinations of QoS configurations influence QoS of applications in different ways. To overcome this problem, design-time formal methods have been applied with mixed success, but lack of sufficient accuracy in prediction, tool support, and understanding of formalism has prevented wider adoption of the formal techniques. A promising approach to address this challenge is to emulate system behavior and gather data on the QoS parameters of interest by experimentation. To realize this approach, which is preferred over formal methods due to their limitations in accurately predicting QoS, we have developed a model-based automatic performance testing framework with generative capabilities to reduce manual efforts in generating a large number of relevant QoS configurations that can be deployed and tested on a cloud platform.
This paper describes our initial efforts in developing and using this technology.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '13 conference proceedings.", } @Article{Vitek:2014:SCR, author = "Jan Vitek", title = "{SIGPLAN Chair}'s report", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "1--1", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641640", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gibbons:2014:SVC, author = "Jeremy Gibbons", title = "{SIGPLAN Vice-Chair}'s report", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "2--2", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641641", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Black:2014:SSR, author = "Andrew Black", title = "{SIGPLAN Secretary}'s report", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "3--3", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641642", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lopes:2014:STR, author = "Cristina V. Lopes", title = "{SIGPLAN Treasurer}'s report", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "4--4", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641643", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Dreyer:2014:SA, author = "Derek Dreyer", title = "{SIGPLAN} awards", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "5--7", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641644", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lawall:2014:SPA, author = "Julia Lawall and Cristina V. 
Lopes", title = "{SIGPLAN Professional Activities Committee Report}", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "8--8", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641645", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hind:2014:SRH, author = "Michael Hind", title = "{SIGPLAN Research Highlights Annual Report}", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "9--9", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641646", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Sewell:2014:PPC, author = "Peter Sewell", title = "{POPL 2014 Program Chair}'s report", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "10--26", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641647", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This note describes the POPL 2014 paper selection process and its rationale.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Lopes:2014:OTP, author = "Cristina V. 
Lopes", title = "The {OOPSLA} two-phase review process", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "27--32", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641648", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Boehm:2014:PP, author = "Hans Boehm and Jack Davidson and Kathleen Fisher and Cormac Flanagan and Jeremy Gibbons and Mary Hall and Graham Hutton and David Padua and Frank Tip and Jan Vitek and Philip Wadler", title = "Practices of {PLDI}", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "33--38", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641649", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Fahndrich:2014:SAS, author = "Manuel F{\"a}hndrich and Francesco Logozzo", title = "{SAS2013} artifact submission experience report", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "39--40", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641650", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Gay:2014:NLH, author = "David Gay and Philip Levis and Robert von Behren and Matt Welsh and Eric Brewer and David Culler", title = "The {nesC} language: a holistic approach to networked embedded systems", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "41--51", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641652", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present nesC, a programming language for networked embedded systems that represent a new design space for application developers. An example of a networked embedded system is a sensor network, which consists of (potentially) thousands of tiny, low-power ``motes,'' each of which execute concurrent, reactive programs that must operate with severe memory and power constraints. nesC's contribution is to support the special needs of this domain by exposing a programming model that incorporates event-driven execution, a flexible concurrency model, and component-oriented application design. Restrictions on the programming model allow the nesC compiler to perform whole-program analyses, including data-race detection (which improves reliability) and aggressive function inlining (which reduces resource consumption). nesC has been used to implement TinyOS, a small operating system for sensor networks, as well as several significant sensor applications. 
nesC and TinyOS have been adopted by a large number of sensor network research groups, and our experience and evaluation of the language shows that it is effective at supporting the complex, concurrent programming style demanded by this new class of deeply networked systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{LeBotlan:2014:MRM, author = "Didier {Le Botlan} and Didier R{\'e}my", title = "{MLF}: raising {ML} to the power of {System F}", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "52--63", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641653", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a type system MLF that generalizes ML with first-class polymorphism as in System F. Expressions may contain second-order type annotations. Every typable expression admits a principal type, which however depends on type annotations. Principal types capture all other types that can be obtained by implicit type instantiation and they can be inferred. All expressions of ML are well-typed without any annotations. All expressions of System F can be mechanically encoded into MLF by dropping all type abstractions and type applications, and injecting types of lambda-abstractions into MLF types. Moreover, only parameters of lambda-abstractions that are used polymorphically need to remain annotated.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Harris:2014:LSL, author = "Tim Harris and Keir Fraser", title = "Language support for lightweight transactions", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "64--78", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641654", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent programming is notoriously difficult. Current abstractions are intricate and make it hard to design computer systems that are reliable and scalable. We argue that these problems can be addressed by moving to a declarative style of concurrency control in which programmers directly indicate the safety properties that they require. In our scheme the programmer demarks sections of code which execute within lightweight software-based transactions that commit atomically and exactly once. These transactions can update shared data, instantiate objects, invoke library features and so on. They can also block, waiting for arbitrary boolean conditions to become true. Transactions which do not access the same shared memory locations can commit concurrently. Furthermore, in general, no performance penalty is incurred for memory accesses outside transactions. We present a detailed design of this proposal along with an implementation and evaluation. 
We argue that the resulting system (i) is easier for mainstream programmers to use, (ii) prevents lock-based priority-inversion and deadlock problems, and (iii) can offer performance advantages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Henzinger:2014:AP, author = "Thomas A. Henzinger and Ranjit Jhala and Rupak Majumdar and Kenneth L. McMillan", title = "Abstractions from proofs", journal = j-SIGPLAN, volume = "49", number = "4S", pages = "79--91", month = apr, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2641638.2641655", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:36:32 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The success of model checking for large programs depends crucially on the ability to efficiently construct parsimonious abstractions. A predicate abstraction is parsimonious if at each control location, it specifies only relationships between current values of variables, and only those which are required for proving correctness. Previous methods for automatically refining predicate abstractions until sufficient precision is obtained do not systematically construct parsimonious abstractions: predicates usually contain symbolic variables, and are added heuristically and often uniformly to many or all control locations at once. We use Craig interpolation to efficiently construct, from a given abstract error trace which cannot be concretized, a parsimonious abstraction that removes the trace. At each location of the trace, we infer the relevant predicates as an interpolant between the two formulas that define the past and the future segment of the trace. Each interpolant is a relationship between current values of program variables, and is relevant only at that particular program location. It can be found by a linear scan of the proof of infeasibility of the trace. We develop our method for programs with arithmetic and pointer expressions, and call-by-value function calls. For function calls, Craig interpolation offers a systematic way of generating relevant predicates that contain only the local variables of the function and the values of the formal parameters when the function was called. We have extended our model checker BLAST with predicate discovery by Craig interpolation, and applied it successfully to C programs with more than 130,000 lines of code, which was not possible with approaches that build less parsimonious abstractions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Kulkarni:2014:EED, author = "Prasad A. Kulkarni", title = "Energy efficient data access techniques", journal = j-SIGPLAN, volume = "49", number = "5", pages = "1--1", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2602568", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy has become a first class design constraint for all types of processors. Data accesses contribute to processor energy usage and can account for up to 25\% of the total energy used in embedded processors.
Using a set-associative level-one data cache (L1 DC) organization is particularly energy inefficient as load operations access all L1 DC tag and data arrays in parallel to reduce access latency, but the data can reside in at most one way. Techniques that reduce L1 DC energy usage at the expense of degrading performance, such as filter caches, have not been adopted. In this presentation I will describe various techniques we have developed to reduce the energy usage for L1 DC accesses without adversely affecting performance. These techniques include avoiding unnecessary loads from L1 DC data arrays and a practical data filter cache design that not only significantly reduces data access energy usage, but also avoids the traditional execution time penalty associated with data filter caches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Spink:2014:ECG, author = "Tom Spink and Harry Wagstaff and Bj{\"o}rn Franke and Nigel Topham", title = "Efficient code generation in a region-based dynamic binary translator", journal = j-SIGPLAN, volume = "49", number = "5", pages = "3--12", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597810", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Region-based JIT compilation operates on translation units comprising multiple basic blocks and, possibly cyclic or conditional, control flow between these. It promises to reconcile aggressive code optimisation and low compilation latency in performance-critical dynamic binary translators. Whilst various region selection schemes and isolated code optimisation techniques have been investigated it remains unclear how to best exploit such regions for efficient code generation. Complex interactions with indirect branch tables and translation caches can have adverse effects on performance if not considered carefully. In this paper we present a complete code generation strategy for a region-based dynamic binary translator, which exploits branch type and control flow profiling information to improve code quality for the common case. We demonstrate that using our code generation strategy a competitive region-based dynamic compiler can be built on top of the LLVM JIT compilation framework. 
For the ARM-V5T target ISA and SPEC CPU 2006 benchmarks we achieve execution rates of, on average, 867 MIPS and up to 1323 MIPS on a standard X86 host machine, outperforming state-of-the-art QEMU-ARM by delivering a speedup of 264\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Lezuo:2014:COC, author = "Roland Lezuo and Philipp Paulweber and Andreas Krall", title = "{CASM}: optimized compilation of abstract state machines", journal = j-SIGPLAN, volume = "49", number = "5", pages = "13--22", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597813", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper we present CASM, a language based on Abstract State Machines (ASM), and its optimizing compiler. ASM is a well-defined (formal) method based on algebraic concepts. A distinct feature of ASM is its combination of parallel and sequential execution semantics. This makes it an excellent choice to formally specify and verify micro-architectures. We present a compilation scheme and an implementation of a runtime system supporting efficient execution of ASM. After introducing novel analysis techniques we present optimizations allowing us to eliminate many costly operations. Benchmark results show that our baseline compiler is 2-3 magnitudes faster than other ASM implementations. The optimizations further increase the performance of the compiled programs up to 264\%. The achieved performance allows our ASM implementation to be used with industry-size applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Lozano:2014:CSC, author = "Roberto Casta{\~n}eda Lozano and Mats Carlsson and Gabriel Hjort Blindell and Christian Schulte", title = "Combinatorial spill code optimization and ultimate coalescing", journal = j-SIGPLAN, volume = "49", number = "5", pages = "23--32", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597815", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a novel combinatorial model that integrates global register allocation based on ultimate coalescing, spill code optimization, register packing, and multiple register banks with instruction scheduling (including VLIW). The model exploits alternative temporaries that hold the same value as a new concept for ultimate coalescing and spill code optimization. The paper presents Unison as a code generator based on the model and advanced solving techniques using constraint programming. Thorough experiments using MediaBench and a processor (Hexagon) that are typical for embedded systems demonstrate that Unison: is robust and scalable; generates faster code than LLVM (up to 41\% with a mean improvement of 7\%); possibly generates optimal code (for 29\% of the experiments); effortlessly supports different optimization criteria (code size on par with LLVM). 
Unison is significant as it addresses the same aspects as traditional code generation algorithms, yet is based on a simple integrated model and robustly can generate optimal code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Ballabriga:2014:CRP, author = "Cl{\'e}ment Ballabriga and Lee Kee Chong and Abhik Roychoudhury", title = "Cache-related preemption delay analysis for {FIFO} caches", journal = j-SIGPLAN, volume = "49", number = "5", pages = "33--42", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597814", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hard real-time systems are typically composed of multiple tasks, subjected to timing constraints. To guarantee that these constraints will be respected, the Worst-Case Response Time (WCRT) of each task is needed. In the presence of systems supporting preemptible tasks, we need to take into account the time lost due to task preemption. A major part of this delay is the Cache-Related Preemption Delay (CRPD), which represents the penalties due to cache block evictions by preempting tasks. Previous works on CRPD have focused on caches with Least Recently used (LRU) replacement policy. However, for many real-world processors such as ARM9 or ARM11, the use of First-in-first-out (FIFO) cache replacement policy is common. In this paper, we propose an approach to compute CRPD in the presence of instruction caches with FIFO replacement policy. We use the result of a FIFO instruction cache categorization analysis to account for single-task cache misses, and we model as an Integer Linear Programming (ILP) system the additional preemption-related cache misses. We study the effect of cache related timing anomalies, our work is the first to deal with the effect of timing anomalies in CRPD computation. We also present a WCRT computation method that takes advantage of the fact that our computed CRPD does not increase linearly with respect to the preemption count. We evaluated our method by computing the CRPD with realistic benchmarks (e.g. drone control application, robot controller application), under various cache configuration parameters. The experimentation shows that our method is able to compute tight CRPD bound for benchmark tasks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Henry:2014:HCW, author = "Julien Henry and Mihail Asavoae and David Monniaux and Claire Ma{\"\i}za", title = "How to compute worst-case execution time by optimization modulo theory and a clever encoding of program semantics", journal = j-SIGPLAN, volume = "49", number = "5", pages = "43--52", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597817", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In systems with hard real-time constraints, it is necessary to compute upper bounds on the worst-case execution time (WCET) of programs; the closer the bound to the real WCET, the better. 
This is especially the case of synchronous reactive control loops with a fixed clock; the WCET of the loop body must not exceed the clock period. We compute the WCET (or at least a close upper bound thereof) as the solution of an optimization modulo theory problem that takes into account the semantics of the program, in contrast to other methods that compute the longest path whether or not it is feasible according to these semantics. Optimization modulo theory extends satisfiability modulo theory (SMT) to maximization problems. Immediate encodings of WCET problems into SMT yield formulas intractable for all current production-grade solvers --- this is inherent to the DPLL(T) approach to SMT implemented in these solvers. By conjoining some appropriate ``cuts'' to these formulas, we considerably reduce the computation time of the SMT-solver. We experimented with our approach on a variety of control programs, using the OTAWA analyzer both as baseline and as underlying microarchitectural analysis for our analysis, and show notable improvement on the WCET bound on a variety of benchmarks and control programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Zheng:2014:WAD, author = "Wenguang Zheng and Hui Wu", title = "{WCET}-aware dynamic instruction cache locking", journal = j-SIGPLAN, volume = "49", number = "5", pages = "53--62", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597820", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Caches are widely used in embedded systems to bridge the increasing speed gap between processors and off-chip memory. However, caches make it significantly harder to compute the WCET (Worst Case Execution Time) of a task. To alleviate this problem, cache locking has been proposed. We investigate the I-cache locking problem, and propose a WCET-aware, min-cut based dynamic instruction cache locking approach for reducing the WCET of a single task. We have implemented our approach and compared it with the two state-of-the-art cache locking approaches by using a set of benchmarks from the MRTC benchmark suite. The experimental results show that our approach achieves average improvements of 41\%, 15\% and 7\% over the partial locking approach for the 256B, 512B and 1KB caches, respectively, and 7\%, 18\% and 17\% over the longest path based dynamic locking approach for the 256B, 512B and 1KB caches, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Martins:2014:ECO, author = "Luiz G. A. Martins and Ricardo Nobre and Alexandre C. B. Delbem and Eduardo Marques and Jo{\~a}o M. P.
Cardoso", title = "Exploration of compiler optimization sequences using clustering-based selection", journal = j-SIGPLAN, volume = "49", number = "5", pages = "63--72", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597821", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Due to the large number of optimizations provided in modern compilers and to compiler optimization specific opportunities, a Design Space Exploration (DSE) is necessary to search for the best sequence of compiler optimizations for a given code fragment (e.g., function). As this exploration is a complex and time consuming task, in this paper we present DSE strategies to select optimization sequences to both improve the performance of each function and reduce the exploration time. The DSE is based on a clustering approach which groups functions with similarities and then explore the reduced search space provided by the optimizations previously suggested for the functions in each group. The identification of similarities between functions uses a data mining method which is applied to a symbolic code representation of the source code. The DSE process uses the reduced set identified by clustering in two ways: as the design space or as the initial configuration. In both ways, the adoption of a pre-selection based on clustering allows the use of simple and fast DSE algorithms. Our experiments for evaluating the effectiveness of the proposed approach address the exploration of compiler optimization sequences considering 49 compilation passes and targeting a Xilinx MicroBlaze processor, and were performed aiming performance improvements for 41 functions. Experimental results reveal that the use of our new clustering-based DSE approach achieved a significant reduction on the total exploration time of the search space (18x over a Genetic Algorithm approach for DSE) at the same time that important performance speedups (43\% over the baseline) were obtained by the optimized codes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Chandramohan:2014:PDP, author = "Kiran Chandramohan and Michael F. P. O'Boyle", title = "Partitioning data-parallel programs for heterogeneous {MPSoCs}: time and energy design space exploration", journal = j-SIGPLAN, volume = "49", number = "5", pages = "73--82", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597822", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multiprocessor System-on-Chips(MPSoCs) are now widely used in embedded devices. MPSoCs typically contain a range of specialised processors. Alongside the CPU, there are microcontrollers, DSPs and other hardware accelerators. Programming these MPSoCs is difficult because of the difference in instruction-set architecture (ISA) and disjoint address spaces. In this paper we consider MPSoCs as a target for individual benchmarks. We examine how data-parallel programs can be optimally mapped to heterogeneous multicores for different criteria such as performance, power and energy. 
We investigate the partitioning of seven benchmarks taken from the DSPstone, UTDSP and Polybench suites. Based on design space exploration we show that the best partition depends on compiler optimization level, program, input size and, crucially, optimization criteria. We develop a straightforward approach that attempts to select the best partitioning for a given program. On average it achieves speedups of 2.2x and energy improvements of 1.45x on the OMAP 4430 platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Guo:2014:EED, author = "Minyi Guo", title = "Energy efficient data access and storage through {HW\slash SW} co-design", journal = j-SIGPLAN, volume = "49", number = "5", pages = "83--83", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2602569", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Massive energy consumption has become a major factor for the design and implementation of datacenters. This has led to numerous academic and industrial efforts to improve the energy efficiency of datacenter infrastructures. As a result, in state-of-the-art datacenter facilities, over 80\% of power is now consumed by servers themselves. Historically, the processor has dominated energy consumption in the server. However, as processors have become more energy efficient, their contribution has been decreasing. On the contrary, energy consumed by data accesses and storage is growing, since multi- and many-core servers are requiring increased main memory bandwidth/capacity, large register files and large-scale storage systems. Accordingly, energy consumed by data accesses and storage is approaching or even surpassing that consumed by processors in many servers. For example, it has been reported that main memory contributes to as much as 40--46\% of total energy consumption in server applications. In this talk, we present our continuing efforts to improve the energy efficiency of data accesses and storage. We study a series of approaches with hardware-software cooperation to save energy consumption of on-chip memory, register file, main memory and storage devices for embedded systems, multi- and many-core servers, respectively. Experiments with a large set of workloads show the accuracy of our analytical models and the effectiveness of our optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{vonKoch:2014:EFS, author = "Tobias J. K. Edler von Koch and Bj{\"o}rn Franke and Pranav Bhandarkar and Anshuman Dasgupta", title = "Exploiting function similarity for code size reduction", journal = j-SIGPLAN, volume = "49", number = "5", pages = "85--94", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597811", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For cost-sensitive or memory-constrained embedded systems, code size is at least as important as performance.
Consequently, compact code generation has become a major focus of attention within the compiler community. In this paper we develop a pragmatic, yet effective code size reduction technique, which exploits structural similarity of functions. It avoids code duplication through merging of similar functions and targeted insertion of control flow to resolve small differences. We have implemented our purely software-based and platform-independent technique in the LLVM compiler framework and evaluated it against the SPEC CPU2006 benchmarks and three target platforms: Intel x86, ARM-based Qualcomm Krait(TM), and Qualcomm Hexagon(TM) DSP. We demonstrate that code size for SPEC CPU2006 can be reduced by more than 550KB on x86. This corresponds to an overall code size reduction of 4\%, and up to 11.5\% for individual programs. Overhead introduced by additional control flow is compensated for by better I-cache performance of the compacted programs. We also show that identifying suitable candidates and subsequent merging of functions can be implemented efficiently.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Roy:2014:AAS, author = "Pooja Roy and Rajarshi Ray and Chundong Wang and Weng Fai Wong", title = "{ASAC}: automatic sensitivity analysis for approximate computing", journal = j-SIGPLAN, volume = "49", number = "5", pages = "95--104", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597812", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The approximation-based programming paradigm is especially attractive for developing error-resilient applications targeting low-power embedded devices. It allows program data to be computed and stored approximately for better energy efficiency. Battery life in smartphones, tablets, etc. is generally more of a concern to users than an application's accuracy or fidelity beyond a certain acceptable quality of service. Therefore, relaxing accuracy to improve energy efficiency is an attractive trade-off when permissible by the application's domain. Recent works suggest source code annotations and type qualifiers to facilitate safe approximate computation and data manipulation. This requires rewriting of programs or the availability of source code for annotation. This may not be feasible as real-world applications tend to be large, with source code that is not readily available. In this paper, we propose a novel sensitivity analysis that automatically generates annotations for programs for the purpose of approximate computing. Our framework, ASAC, extracts information about the sensitivity of the output with respect to program data. We show that the program output is sensitive to only a subset of program data that we deem critical, and hence must be precise. The rest of the data can be computed and stored approximately. We evaluated our analysis on a range of applications, and achieved 86\% accuracy compared to manual annotations by programmers.
We validated our analysis by showing that the applications are within the acceptable QoS threshold if we approximate the non-critical data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Chaudhary:2014:ESC, author = "Sandeep Chaudhary and Sebastian Fischmeister and Lin Tan", title = "{em-SPADE}: a compiler extension for checking rules extracted from processor specifications", journal = j-SIGPLAN, volume = "49", number = "5", pages = "105--114", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597823", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Traditional compilers ignore processor specifications, thousands of pages of which are available for modern processors. To bridge this gap, em-SPADE analyzes processor specifications and creates processor-specific rules to reduce low-level programming errors. This work shows the potential of automatically analyzing processor- and other hardware specifications to detect low-level programming errors at compile time. em-SPADE is a compiler extension to automatically detect software bugs in low-level programs. From processor specifications, a preprocessor extracts target-specific rules such as register use and read-only or reserved registers. A special LLVM pass then uses these rules to detect incorrect register assignments. Our experiments with em-SPADE have correctly extracted 652 rules from 15 specifications and consequently found 20 bugs in ten software projects. The work is generalizable to other types of specifications and shows the clear prospects of using hardware specifications to enhance compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Beaugnon:2014:VVO, author = "Ulysse Beaugnon and Alexey Kravets and Sven van Haastregt and Riyadh Baghdadi and David Tweed and Javed Absar and Anton Lokhmotov", title = "{VOBLA}: a vehicle for optimized basic linear algebra", journal = j-SIGPLAN, volume = "49", number = "5", pages = "115--124", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597818", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present VOBLA, a domain-specific language designed for programming linear algebra libraries. VOBLA is compiled to PENCIL, a domain independent intermediate language designed for efficient mapping to accelerator architectures such as GPGPUs. PENCIL is compiled to efficient, platform-specific OpenCL code using techniques based on the polyhedral model. This approach addresses both the programmer productivity and performance portability concerns associated with accelerator programming. We demonstrate our approach by using VOBLA to implement a BLAS library. We have evaluated the performance of OpenCL code generated using our compilation flow on ARM Mali, AMD Radeon, and AMD Opteron platforms. 
The generated code is currently on average 1.9x slower than highly hand-optimized OpenCL code, but on average 8.1x faster than straightforward OpenCL code. Given that the VOBLA coding takes significantly less effort compared to hand-optimizing OpenCL code, we believe our approach leads to improved productivity and performance portability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Bebelis:2014:FSP, author = "Vagelis Bebelis and Pascal Fradet and Alain Girault", title = "A framework to schedule parametric dataflow applications on many-core platforms", journal = j-SIGPLAN, volume = "49", number = "5", pages = "125--134", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597819", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dataflow models, such as SDF, have been effectively used to program streaming applications while ensuring their liveness and boundedness. Yet, industrials are struggling to design the next generation of high definition video applications using these models. Such applications demand new features such as parameters to express dynamic input/output rate and topology modifications. Their implementation on modern many-core platforms is a major challenge. We tackle these problems by proposing a generic and flexible framework to schedule streaming applications designed in a parametric dataflow model of computation. We generate parallel as soon as possible (ASAP) schedules targeted to the new STHORM many-core platform of STMicroelectronics. Furthermore, these schedules can be customized using user-defined ordering and resource constraints. The parametric dataflow graph is associated with generic or user-defined specific constraints aimed at minimizing timing, buffer sizes, power consumption, or other criteria. The scheduling algorithm executes with minimal overhead and can be adapted to different scheduling policies just by adding some constraints. The safety of both the dataflow graph and constraints can be checked statically and all schedules are guaranteed to be bounded and deadlock free. We illustrate the scheduling capabilities of our approach using a real world application: the VC-1 video decoder for high definition video streaming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Lee:2014:IPL, author = "Jinyong Lee and Jongwon Lee and Jongeun Lee and Yunheung Paek", title = "Improving performance of loops on {DIAM-based} {VLIW} architectures", journal = j-SIGPLAN, volume = "49", number = "5", pages = "135--144", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597825", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent studies show that very long instruction word (VLIW) architectures, which inherently have wide datapath (e.g. 
128 or 256 bits for one VLIW instruction word), can benefit from dynamic implied addressing mode (DIAM) and can achieve lower power consumption and smaller code size with a small performance overhead. Such overhead, which is claimed to be small, is mainly caused by the execution of additionally generated special instructions for conveying information that cannot be encoded in the reduced instruction bit-width. In this paper, however, we show that the performance impact of applying DIAM to VLIW architectures cannot be overlooked, especially when applications possess a high level of instruction-level parallelism (ILP), which is mostly the case for loops as a result of aggressive code scheduling. We also propose a way to relieve the performance degradation, focusing especially on loops, since loops account for almost 90\% of total execution time in programs and tend to have high ILP. We first implement the original DIAM compilation technique in a compiler, and augment it with the proposed loop optimization scheme to show that ours can clearly alleviate the performance loss caused by the excessive number of additional instructions, with the help of slightly modified hardware. Moreover, the well-known loop unrolling scheme, which would produce denser code in loops at the cost of substantial code size bloating, is integrated into our compiler. The experimental results show that the loop unrolling technique, combined with our augmented DIAM scheme, produces far better code in terms of performance with quite an acceptable amount of code size increase.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Wingbermuehle:2014:SMS, author = "Joseph G. Wingbermuehle and Ron K. Cytron and Roger D. Chamberlain", title = "Superoptimization of memory subsystems", journal = j-SIGPLAN, volume = "49", number = "5", pages = "145--154", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597816", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The disparity in performance between processors and main memories has led computer architects to incorporate large cache hierarchies in modern computers. Because these cache hierarchies are designed to be general-purpose, they may not provide the best possible performance for a given application. In this paper, we determine a memory subsystem well suited for a given application and main memory by discovering a memory subsystem comprised of caches, scratchpads, and other components that are combined to provide better performance. We draw motivation from the superoptimization of instruction sequences, which successfully finds unusually clever instruction sequences for programs.
Targeting both ASIC and FPGA devices, we show that it is possible to discover unusual memory subsystems that provide performance improvements over a typical memory subsystem.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Kim:2014:LBL, author = "Hongjune Kim and Seonmyeong Bak and Jaejin Lee", title = "Lightweight and block-level concurrent sweeping for {JavaScript} garbage collection", journal = j-SIGPLAN, volume = "49", number = "5", pages = "155--164", month = may, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666357.2597824", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:37:30 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "JavaScript is a dynamically typed language originally developed for the purpose of giving dynamic client-side behaviors to web pages. It is mainly used in web application development, and because of its popularity and rapid development style it is now also used in other types of applications. Increasing data processing requirements and growing usage in more resource-limited environments, such as mobile devices, have created demands for JavaScript implementations to handle memory more efficiently through garbage collection. Since aggressive use of time-consuming operations in garbage collection can slow down the JavaScript application, there is a trade-off relationship between the effectiveness and the execution time of garbage collection. In this paper, we present a lightweight, block-level concurrent sweeping mechanism for a mark-and-sweep garbage collector. The sweeping process is detached to an additional thread to eagerly collect free memory blocks and recycle them. To minimize the overhead that comes from the synchronization between the mutator thread and the new sweeping thread, we have chosen a coarse-grained block-level collecting scheme for sweeping. To avoid contention that comes from object destruction, we execute the object destruction phase concurrently with the foreground marking phase. We have implemented our algorithm in the JavaScript Core (JSC) engine embedded in the WebKit browser, which uses a variant of the mark-and-sweep algorithm to manage JavaScript objects. The original garbage collection implementation performs lazy sweeping that cannot reuse the free blocks.
We evaluate our implementation on an ARM-based mobile system and show that memory utilization of the system is significantly improved without performance degradation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '14 conference proceedings.", } @Article{Padua:2014:WEI, author = "David Padua", title = "What exactly is inexact computation good for?", journal = j-SIGPLAN, volume = "49", number = "6", pages = "1--1", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2604001", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Our willingness to deliberately trade accuracy of computing systems for significant resource savings, notably energy consumption, got a boost from two directions. First, energy (or power, the more popularly used measure) consumption started emerging as a serious hurdle to our ability to continue scaling the complexity of processors, and thus enable ever richer computing applications. This ``energy hurdle'' spanned the gamut from large data-centers to portable embedded computing systems. Second, many believed that an engine of growth that supported scaling, captured by Gordon Moore's remarkable prophecy (Moore's law), was headed towards an irrevocable cliff edge --- when this happens, our ability to produce computing systems whose hardware would support precise or exact computing would diminish greatly. In this talk which emphasizes the physical and hardware layers of abstraction where all of these troubles start (after all energy is rooted in thermodynamics), I will first review reasons that compelled and encouraged us to consider trading accuracy for energy savings deliberately resulting in inexact computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Kuper:2014:TPE, author = "Lindsey Kuper and Aaron Todd and Sam Tobin-Hochstadt and Ryan R. Newton", title = "Taming the parallel effect zoo: extensible deterministic parallelism with {LVish}", journal = j-SIGPLAN, volume = "49", number = "6", pages = "2--14", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594312", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A fundamental challenge of parallel programming is to ensure that the observable outcome of a program remains deterministic in spite of parallel execution. Language-level enforcement of determinism is possible, but existing deterministic-by-construction parallel programming models tend to lack features that would make them applicable to a broad range of problems. Moreover, they lack extensibility: it is difficult to add or change language features without breaking the determinism guarantee. The recently proposed LVars programming model, and the accompanying LVish Haskell library, took a step toward broadly-applicable guaranteed-deterministic parallel programming. 
The LVars model allows communication through shared monotonic data structures to which information can only be added, never removed, and for which the order in which information is added is not observable. LVish provides a Par monad for parallel computation that encapsulates determinism-preserving effects while allowing a more flexible form of communication between parallel tasks than previous guaranteed-deterministic models provided. While applying LVar-based programming to real problems using LVish, we have identified and implemented three capabilities that extend its reach: inflationary updates other than least-upper-bound writes; transitive task cancellation; and parallel mutation of non-overlapping memory locations. The unifying abstraction we use to add these capabilities to LVish---without suffering added complexity or cost in the core LVish implementation, or compromising determinism---is a form of monad transformer, extended to handle the Par monad. With our extensions, LVish provides the most broadly applicable guaranteed-deterministic parallel programming interface available to date. We demonstrate the viability of our approach both with traditional parallel benchmarks and with results from a real-world case study: a bioinformatics application that we parallelized using our extended version of LVish.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Surendran:2014:TDR, author = "Rishi Surendran and Raghavan Raman and Swarat Chaudhuri and John Mellor-Crummey and Vivek Sarkar", title = "Test-driven repair of data races in structured parallel programs", journal = j-SIGPLAN, volume = "49", number = "6", pages = "15--25", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594335", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A common workflow for developing parallel software is as follows: (1) start with a sequential program, (2) identify subcomputations that should be converted to parallel tasks, (3) insert synchronization to achieve the same semantics as the sequential program, and repeat steps (2) and (3) as needed to improve performance. Though this is not the only approach to developing parallel software, it is sufficiently common to warrant special attention as parallel programming becomes ubiquitous. This paper focuses on automating step (3), which is usually the hardest step for developers who lack expertise in parallel programming. Past solutions to the problem of repairing parallel programs have used static-only or dynamic-only approaches, both of which incur significant limitations in practice. Static approaches can guarantee soundness in many cases but are limited in precision when analyzing medium or large-scale software with accesses to pointer-based data structures in multiple procedures. Dynamic approaches are more precise, but their proposed repairs are limited to a single input and are not reflected back in the original source program. In this paper, we introduce a hybrid static+dynamic test-driven approach to repairing data races in structured parallel programs. Our approach includes a novel coupling between static and dynamic analyses. 
First, we execute the program on a concrete test input and determine the set of data races for this input dynamically. Next, we compute a set of ``finish'' placements that prevent these races and also respects the static scoping rules of the program while maximizing parallelism. Empirical results on standard benchmarks and student homework submissions from a parallel computing course establish the effectiveness of our approach with respect to compile-time overhead, precision, and performance of the repaired code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Stork:2014:APB, author = "Sven Stork and Karl Naden and Joshua Sunshine and Manual Mohr and Alcides Fonseca and Paulo Marques and Jonathan Aldrich", title = "{{\AE}minium}: a permission based concurrent-by-default programming language approach", journal = j-SIGPLAN, volume = "49", number = "6", pages = "26--26", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594344", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The aim of {\AE}MINIUM is to study the implications of having a concurrent-by-default programming language. This includes language design, runtime system, performance and software engineering considerations. We conduct our study through the design of the concurrent-by-default {\AE}MINIUM programming language. {\AE}MINIUM leverages the permission flow of object and group permissions through the program to validate the program's correctness and to automatically infer a possible parallelization strategy via a dataflow graph. {\AE}MINIUM supports not only fork-join parallelism but more general dataflow patterns of parallelism. In this paper we present a formal system, called \mu {\AE}MINIUM, modeling the core concepts of {\AE}MINIUM. \mu {\AE}MINIUM's static type system is based on Featherweight Java with {\AE}MINIUM-specific extensions. Besides checking for correctness {\AE}MINIUM's type system it also uses the permission flow to compute a potential parallel execution strategy for the program. \mu {\AE}MINIUM's dynamic semantics use a concurrent-by-default evaluation approach. Along with the formal system we present its soundness proof. We provide a full description of the implementation along with the description of various optimization techniques we used. We implemented {\AE}MINIUM as an extension of the Plaid programming language, which has first-class support for permissions built-in. The {\AE}MINIUM implementation and all case studies are publicly available under the General Public License. We use various case studies to evaluate {\AE}MINIUM's applicability and to demonstrate that {\AE}MINIUM parallelized code has performance improvements compared to its sequential counterpart. We chose to use case studies from common domains or problems that are known to benefit from parallelization, to show that {\AE}MINIUM is powerful enough to encode them. We demonstrate through a webserver application, which evaluates {\AE}MINIUM's impact on latency-bound applications, that {\AE}MINIUM can achieve a 70\% performance improvement over the sequential counterpart. 
In another case study we chose to implement a dictionary function to evaluate {\AE}MINIUM's capabilities to express essential data structures. Our evaluation demonstrates that {\AE}MINIUM can be used to express parallelism in such data-structures and that the performance benefits scale with the amount of annotation effort which is put into the implementation. We chose an integral computationally example to evaluate pure functional programming and computational intensive use cases. Our experiments show that {\AE}MINIUM is capable of extracting parallelism from functional code and achieving performance improvements up to the limits of Plaid's inherent performance bounds. Overall, we hope that the work helps to advance concurrent programming in modern programming environments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Jagannathan:2014:ARV, author = "Suresh Jagannathan and Vincent Laporte and Gustavo Petri and David Pichardie and Jan Vitek", title = "Atomicity refinement for verified compilation", journal = j-SIGPLAN, volume = "49", number = "6", pages = "27--27", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594346", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We consider the verified compilation of high-level managed languages like Java or C\# whose intermediate representations provide support for shared-memory synchronization and automatic memory management. In this environment, the interactions between application threads and the language runtime (e.g., the garbage collector) are regulated by compiler-injected code snippets. Example of snippets include allocation fast paths among others. In our TOPLAS paper we propose a refinement-based proof methodology that precisely relates concurrent code expressed at different abstraction levels, cognizant throughout of the relaxed memory semantics of the underlying processor. Our technique allows the compiler writer to reason compositionally about the atomicity of low-level concurrent code used to implement managed services. We illustrate our approach with examples taken from the verification of a concurrent garbage collector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Biswas:2014:DES, author = "Swarnendu Biswas and Jipeng Huang and Aritra Sengupta and Michael D. Bond", title = "{DoubleChecker}: efficient sound and precise atomicity checking", journal = j-SIGPLAN, volume = "49", number = "6", pages = "28--39", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594323", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Atomicity is a key correctness property that allows programmers to reason about code regions in isolation. 
However, programs often fail to enforce atomicity correctly, leading to atomicity violations that are difficult to detect. Dynamic program analysis can detect atomicity violations based on an atomicity specification, but existing approaches slow programs substantially. This paper presents DoubleChecker, a novel sound and precise atomicity checker whose key insight lies in its use of two new cooperating dynamic analyses. Its imprecise analysis tracks cross-thread dependences soundly but imprecisely with significantly better performance than a fully precise analysis. Its precise analysis is more expensive but only needs to process a subset of the execution identified as potentially involved in atomicity violations by the imprecise analysis. If DoubleChecker operates in single-run mode, the two analyses execute in the same program run, which guarantees soundness and precision but requires logging program accesses to pass from the imprecise to the precise analysis. In multi-run mode, the first program run executes only the imprecise analysis, and a second run executes both analyses. Multi-run mode trades accuracy for performance; each run of multi-run mode outperforms single-run mode, but can potentially miss violations. We have implemented DoubleChecker and an existing state-of-the-art atomicity checker called Velodrome in a high-performance Java virtual machine. DoubleChecker's single-run mode significantly outperforms Velodrome, while still providing full soundness and precision. DoubleChecker's multi-run mode improves performance further, without significantly impacting soundness in practice. These results suggest that DoubleChecker's approach is a promising direction for improving the performance of dynamic atomicity checking over prior work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Alglave:2014:HCM, author = "Jade Alglave and Luc Maranget and Michael Tautschnig", title = "Herding cats: modelling, simulation, testing, and data-mining for weak memory", journal = j-SIGPLAN, volume = "49", number = "6", pages = "40--40", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594347", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There is a joke where a physicist and a mathematician are asked to herd cats. The physicist starts with an infinitely large pen which he reduces until it is of reasonable diameter yet contains all the cats. The mathematician builds a fence around himself and declares the outside to be the inside. Defining memory models is akin to herding cats: both the physicist's or mathematician's attitudes are tempting, but neither can go without the other.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Rompf:2014:SPJ, author = "Tiark Rompf and Arvind K. Sujeeth and Kevin J. 
Brown and HyoukJoong Lee and Hassan Chafi and Kunle Olukotun", title = "Surgical precision {JIT} compilers", journal = j-SIGPLAN, volume = "49", number = "6", pages = "41--52", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594316", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Just-in-time (JIT) compilation of running programs provides more optimization opportunities than offline compilation. Modern JIT compilers, such as those in virtual machines like Oracle's HotSpot for Java or Google's V8 for JavaScript, rely on dynamic profiling as their key mechanism to guide optimizations. While these JIT compilers offer good average performance, their behavior is a black box and the achieved performance is highly unpredictable. In this paper, we propose to turn JIT compilation into a precision tool by adding two essential and generic metaprogramming facilities: First, allow programs to invoke JIT compilation explicitly. This enables controlled specialization of arbitrary code at run-time, in the style of partial evaluation. It also enables the JIT compiler to report warnings and errors to the program when it is unable to compile a code path in the demanded way. Second, allow the JIT compiler to call back into the program to perform compile-time computation. This lets the program itself define the translation strategy for certain constructs on the fly and gives rise to a powerful JIT macro facility that enables ``smart'' libraries to supply domain-specific compiler optimizations or safety checks. We present Lancet, a JIT compiler framework for Java bytecode that enables such a tight, two-way integration with the running program. Lancet itself was derived from a high-level Java bytecode interpreter: staging the interpreter using LMS (Lightweight Modular Staging) produced a simple bytecode compiler. Adding abstract interpretation turned the simple compiler into an optimizing compiler. This fact provides compelling evidence for the scalability of the staged-interpreter approach to compiler construction. In the case of Lancet, JIT macros also provide a natural interface to existing LMS-based toolchains such as the Delite parallelism and DSL framework, which can now serve as accelerator macros for arbitrary JVM bytecode.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Schkufza:2014:SOF, author = "Eric Schkufza and Rahul Sharma and Alex Aiken", title = "Stochastic optimization of floating-point programs with tunable precision", journal = j-SIGPLAN, volume = "49", number = "6", pages = "53--64", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594302", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The aggressive optimization of floating-point computations is an important problem in high-performance computing. 
Unfortunately, floating-point instruction sets have complicated semantics that often force compilers to preserve programs as written. We present a method that treats floating-point optimization as a stochastic search problem. We demonstrate the ability to generate reduced precision implementations of Intel's handwritten C numeric library which are up to 6 times faster than the original code, and achieve end-to-end speedups of over 30\% on a direct numeric simulation and a ray tracer by optimizing kernels that can tolerate a loss of precision while still remaining correct. Because these optimizations are mostly not amenable to formal verification using the current state of the art, we present a stochastic search technique for characterizing maximum error. The technique comes with an asymptotic guarantee and provides strong evidence of correctness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Stock:2014:FED, author = "Kevin Stock and Martin Kong and Tobias Grosser and Louis-No{\"e}l Pouchet and Fabrice Rastello and J. Ramanujam and P. Sadayappan", title = "A framework for enhancing data reuse via associative reordering", journal = j-SIGPLAN, volume = "49", number = "6", pages = "65--76", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594342", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The freedom to reorder computations involving associative operators has been widely recognized and exploited in designing parallel algorithms and to a more limited extent in optimizing compilers. In this paper, we develop a novel framework utilizing the associativity and commutativity of operations in regular loop computations to enhance register reuse. Stencils represent a particular class of important computations where the optimization framework can be applied to enhance performance. We show how stencil operations can be implemented to better exploit register reuse and reduce load/stores. We develop a multi-dimensional retiming formalism to characterize the space of valid implementations in conjunction with other program transformations. Experimental results demonstrate the effectiveness of the framework on a collection of high-order stencils.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{DeVito:2014:FCR, author = "Zachary DeVito and Daniel Ritchie and Matt Fisher and Alex Aiken and Pat Hanrahan", title = "First-class runtime generation of high-performance types using exotypes", journal = j-SIGPLAN, volume = "49", number = "6", pages = "77--88", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594307", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce exotypes, user-defined types that combine the flexibility of meta-object protocols in dynamically-typed languages with the performance control of low-level languages. 
Like objects in dynamic languages, exotypes are defined programmatically at run-time, allowing behavior based on external data such as a database schema. To achieve high performance, we use staged programming to define the behavior of an exotype during a runtime compilation step and implement exotypes in Terra, a low-level staged programming language. We show how exotype constructors compose, and use exotypes to implement high-performance libraries for serialization, dynamic assembly, automatic differentiation, and probabilistic programming. Each exotype achieves expressiveness similar to libraries written in dynamically-typed languages but implements optimizations that exceed the performance of existing libraries written in low-level statically-typed languages. Though each implementation is significantly shorter, our serialization library is 11 times faster than Kryo, and our dynamic assembler is 3--20 times faster than Google's Chrome assembler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Greenman:2014:GFB, author = "Ben Greenman and Fabian Muehlboeck and Ross Tate", title = "Getting {F}-bounded polymorphism into shape", journal = j-SIGPLAN, volume = "49", number = "6", pages = "89--99", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594308", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a way to restrict recursive inheritance without sacrificing the benefits of F-bounded polymorphism. In particular, we distinguish two new concepts, materials and shapes, and demonstrate through a survey of 13.5 million lines of open-source generic-Java code that these two concepts never actually overlap in practice. With this Material-Shape Separation, we prove that even na{\"\i}ve type-checking algorithms are sound and complete, some of which address problems that were unsolvable even under the existing proposals for restricting inheritance. We illustrate how the simplicity of our design reflects the design intuitions employed by programmers and potentially enables new features coming into demand for upcoming programming languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Simon:2014:OIF, author = "Axel Simon", title = "Optimal inference of fields in row-polymorphic records", journal = j-SIGPLAN, volume = "49", number = "6", pages = "100--111", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594313", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Flexible records are a powerful concept in type systems that form the basis of, for instance, objects in dynamically typed languages. One caveat of using flexible records is that a program may try to access a record field that does not exist. We present a type inference algorithm that checks for these runtime errors. 
The novelty of our algorithm is that it satisfies a clear notion of completeness: The inferred types are optimal in the sense that type annotations cannot increase the set of typeable programs. Under certain assumptions, our algorithm guarantees the following stronger property: it rejects a program if and only if it contains a path from an empty record to a field access on which the field has not been added. We derive this optimal algorithm by abstracting a semantics to types. The derived inference rules use a novel combination of type terms and Boolean functions that retains the simplicity of unification-based type inference but adds the ability of Boolean functions to express implications, thereby addressing the challenge of combining implications and types. By following our derivation method, we show how various operations such as record concatenation and branching if a field exists lead to Boolean satisfiability problems of different complexity. Analogously, we show that more expressive type systems give rise to SMT problems. On the practical side, we present an implementation of the select and update operations and give practical evidence that these are sufficient in real-world applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Sampson:2014:EVP, author = "Adrian Sampson and Pavel Panchekha and Todd Mytkowicz and Kathryn S. McKinley and Dan Grossman and Luis Ceze", title = "Expressing and verifying probabilistic assertions", journal = j-SIGPLAN, volume = "49", number = "6", pages = "112--122", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594294", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Traditional assertions express correctness properties that must hold on every program execution. However, many applications have probabilistic outcomes and consequently their correctness properties are also probabilistic (e.g., they identify faces in images, consume sensor data, or run on unreliable hardware). Traditional assertions do not capture these correctness properties. This paper proposes that programmers express probabilistic correctness properties with probabilistic assertions and describes a new probabilistic evaluation approach to efficiently verify these assertions. Probabilistic assertions are Boolean expressions that express the probability that a property will be true in a given execution rather than asserting that the property must always be true. Given either specific inputs or distributions on the input space, probabilistic evaluation verifies probabilistic assertions by first performing distribution extraction to represent the program as a Bayesian network. Probabilistic evaluation then uses statistical properties to simplify this representation to efficiently compute assertion probabilities directly or with sampling. Our approach is a mix of both static and dynamic analysis: distribution extraction statically builds and optimizes the Bayesian network representation and sampling dynamically interprets this representation. We implement our approach in a tool called Mayhap for C and C++ programs. 
We evaluate expressiveness, correctness, and performance of Mayhap on programs that use sensors, perform approximate computation, and obfuscate data for privacy. Our case studies demonstrate that probabilistic assertions describe useful correctness properties and that Mayhap efficiently verifies them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Borges:2014:CSS, author = "Mateus Borges and Antonio Filieri and Marcelo d'Amorim and Corina S. Pasareanu and Willem Visser", title = "Compositional solution space quantification for probabilistic software analysis", journal = j-SIGPLAN, volume = "49", number = "6", pages = "123--132", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594329", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Probabilistic software analysis aims at quantifying how likely a target event is to occur during program execution. Current approaches rely on symbolic execution to identify the conditions to reach the target event and try to quantify the fraction of the input domain satisfying these conditions. Precise quantification is usually limited to linear constraints, while only approximate solutions can be provided in general through statistical approaches. However, statistical approaches may fail to converge to an acceptable accuracy within a reasonable time. We present a compositional statistical approach for the efficient quantification of solution spaces for arbitrarily complex constraints over bounded floating-point domains. The approach leverages interval constraint propagation to improve the accuracy of the estimation by focusing the sampling on the regions of the input domain containing the sought solutions. Preliminary experiments show significant improvement on previous approaches both in results accuracy and analysis time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Hur:2014:SPP, author = "Chung-Kil Hur and Aditya V. Nori and Sriram K. Rajamani and Selva Samuel", title = "Slicing probabilistic programs", journal = j-SIGPLAN, volume = "49", number = "6", pages = "133--144", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594303", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Probabilistic programs use familiar notation of programming languages to specify probabilistic models. Suppose we are interested in estimating the distribution of the return expression r of a probabilistic program P. We are interested in slicing the probabilistic program P and obtaining a simpler program Sli( P ) which retains only those parts of P that are relevant to estimating r, and elides those parts of P that are not relevant to estimating r. We desire that the Sli transformation be both correct and efficient. By correct, we mean that P and Sli( P ) have identical estimates on r. By efficient, we mean that estimation over Sli( P ) be as fast as possible. 
We show that the usual notion of program slicing, which traverses control and data dependencies backward from the return expression r, is unsatisfactory for probabilistic programs, since it produces incorrect slices on some programs and sub-optimal ones on others. Our key insight is that in addition to the usual notions of control dependence and data dependence that are used to slice non-probabilistic programs, a new kind of dependence called observe dependence arises naturally due to observe statements in probabilistic programs. We propose a new definition of Sli( P ) which is both correct and efficient for probabilistic programs, by including observe dependence in addition to control and data dependences for computing slices. We prove correctness mathematically, and we demonstrate efficiency empirically. We show that by applying the Sli transformation as a pre-pass, we can improve the efficiency of probabilistic inference, not only in our own inference tool R2, but also in other systems for performing inference such as Church and Infer.NET.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Cai:2014:TCH, author = "Yufei Cai and Paolo G. Giarrusso and Tillmann Rendel and Klaus Ostermann", title = "A theory of changes for higher-order languages: incrementalizing $ \lambda $-calculi by static differentiation", journal = j-SIGPLAN, volume = "49", number = "6", pages = "145--155", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594304", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "If the result of an expensive computation is invalidated by a small change to the input, the old result should be updated incrementally instead of reexecuting the whole computation. We incrementalize programs through their derivative. A derivative maps changes in the program's input directly to changes in the program's output, without reexecuting the original program. We present a program transformation taking programs to their derivatives, which is fully static and automatic, supports first-class functions, and produces derivatives amenable to standard optimization. We prove the program transformation correct in Agda for a family of simply-typed $ \lambda $-calculi, parameterized by base types and primitives. A precise interface specifies what is required to incrementalize the chosen primitives. We investigate performance by a case study: We implement in Scala the program transformation, a plugin and improve performance of a nontrivial program by orders of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Hammer:2014:ACD, author = "Matthew A. Hammer and Khoo Yit Phang and Michael Hicks and Jeffrey S. 
Foster", title = "{Adapton}: composable, demand-driven incremental computation", journal = j-SIGPLAN, volume = "49", number = "6", pages = "156--166", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594324", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many researchers have proposed programming languages that support incremental computation (IC), which allows programs to be efficiently re-executed after a small change to the input. However, existing implementations of such languages have two important drawbacks. First, recomputation is oblivious to specific demands on the program output; that is, if a program input changes, all dependencies will be recomputed, even if an observer no longer requires certain outputs. Second, programs are made incremental as a unit, with little or no support for reusing results outside of their original context, e.g., when reordered. To address these problems, we present $ \lambda_{ic}^{cdd} $, a core calculus that applies a demand-driven semantics to incremental computation, tracking changes in a hierarchical fashion in a novel demanded computation graph. $ \lambda_{ic}^{cdd} $ also formalizes an explicit separation between inner, incremental computations and outer observers. This combination ensures $ \lambda_{ic}^{cdd} $ programs only recompute computations as demanded by observers, and allows inner computations to be reused more liberally. We present Adapton, an OCaml library implementing $ \lambda_{ic}^{cdd} $. We evaluated Adapton on a range of benchmarks, and found that it provides reliable speedups, and in many cases dramatically outperforms state-of-the-art IC approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Aung:2014:SS, author = "Min Aung and Susan Horwitz and Rich Joiner and Thomas Reps", title = "Specialization slicing", journal = j-SIGPLAN, volume = "49", number = "6", pages = "167--167", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594345", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we investigate opportunities to be gained from broadening the definition of program slicing. A major inspiration for our work comes from the field of partial evaluation, in which a wide repertoire of techniques have been developed for specializing programs. While slicing can also be harnessed for specializing programs, the kind of specialization obtainable via slicing has heretofore been quite restricted, compared to the kind of specialization allowed in partial evaluation. In particular, most slicing algorithms are what the partial-evaluation community calls monovariant: each program element of the original program generates at most one element in the answer. In contrast, partial-evaluation algorithms can be polyvariant, i.e., one program element in the original program may correspond to more than one element in the specialized program. 
The full paper appears in ACM TOPLAS 36 (2), 2014.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Hoare:2014:LCP, author = "Tony Hoare", title = "Laws of concurrent programming", journal = j-SIGPLAN, volume = "49", number = "6", pages = "168--168", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2604002", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The talk extends the Laws of Programming [1] by four laws governing concurrent composition of programs. This operator is associative and commutative and distributive through union; and it has the same unit (do nothing) as sequential composition. Furthermore, sequential and concurrent composition distribute through each other, in accordance with an exchange law; this permits an implementation of concurrency by partial interleaving.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Sridharan:2014:AEP, author = "Srinath Sridharan and Gagan Gupta and Gurindar S. Sohi", title = "Adaptive, efficient, parallel execution of parallel programs", journal = j-SIGPLAN, volume = "49", number = "6", pages = "169--180", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594292", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Future multicore processors will be heterogeneous, be increasingly less reliable, and operate in dynamically changing operating conditions. Such environments will result in a constantly varying pool of hardware resources which can greatly complicate the task of efficiently exposing a program's parallelism onto these resources. Coupled with this uncertainty is the diverse set of efficiency metrics that users may desire. This paper proposes Varuna, a system that dynamically, continuously, rapidly and transparently adapts a program's parallelism to best match the instantaneous capabilities of the hardware resources while satisfying different efficiency metrics. Varuna is applicable to both multithreaded and task-based programs and can be seamlessly inserted between the program and the operating system without needing to change the source code of either. We demonstrate Varuna's effectiveness in diverse execution environments using unaltered C/C++ parallel programs from various benchmark suites. Regardless of the execution environment, Varuna always outperformed the state-of-the-art approaches for the efficiency metrics considered.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Gupta:2014:GPR, author = "Gagan Gupta and Srinath Sridharan and Gurindar S. 
Sohi", title = "Globally precise-restartable execution of parallel programs", journal = j-SIGPLAN, volume = "49", number = "6", pages = "181--192", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594306", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging trends in computer design and use are likely to make exceptions, once rare, the norm, especially as the system size grows. Due to exceptions, arising from hardware faults, approximate computing, dynamic resource management, etc., successful and error-free execution of programs may no longer be assured. Yet, designers will want to tolerate the exceptions so that the programs execute completely, efficiently and without external intervention. Modern computers easily handle exceptions in sequential programs, using precise interrupts. But they are ill-equipped to handle exceptions in parallel programs, which are growing in prevalence. In this work we introduce the notion of globally precise-restartable execution of parallel programs, analogous to precise-interruptible execution of sequential programs. We present a software runtime recovery system based on the approach to handle exceptions in suitably-written parallel programs. Qualitative and quantitative analyses show that the proposed system scales with the system size, especially when exceptions are frequent, unlike the conventional checkpoint-and-recovery method.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Mitra:2014:AAP, author = "Subrata Mitra and Ignacio Laguna and Dong H. Ahn and Saurabh Bagchi and Martin Schulz and Todd Gamblin", title = "Accurate application progress analysis for large-scale parallel debugging", journal = j-SIGPLAN, volume = "49", number = "6", pages = "193--203", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594336", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Debugging large-scale parallel applications is challenging. In most HPC applications, parallel tasks progress in a coordinated fashion, and thus a fault in one task can quickly propagate to other tasks, making it difficult to debug. Finding the least-progressed tasks can significantly reduce the effort to identify the task where the fault originated. However, existing approaches for detecting them suffer low accuracy and large overheads; either they use imprecise static analysis or are unable to infer progress dependence inside loops. We present a loop-aware progress-dependence analysis tool, Prodometer, which determines relative progress among parallel tasks via dynamic analysis. Our fault-injection experiments suggest that its accuracy and precision are over 90\% for most cases and that it scales well up to 16,384 MPI tasks. 
Further, our case study shows that it significantly helped diagnosing a perplexing error in MPI, which only manifested at large scale.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Tavarageri:2014:CAD, author = "Sanket Tavarageri and Sriram Krishnamoorthy and P. Sadayappan", title = "Compiler-assisted detection of transient memory errors", journal = j-SIGPLAN, volume = "49", number = "6", pages = "204--215", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594298", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The probability of bit flips in hardware memory systems is projected to increase significantly as memory systems continue to scale in size and complexity. Effective hardware-based error detection and correction require that the complete data path, involving all parts of the memory system, be protected with sufficient redundancy. First, this may be costly to employ on commodity computing platforms, and second, even on high-end systems, protection against multi-bit errors may be lacking. Therefore, augmenting hardware error detection schemes with software techniques is of considerable interest. In this paper, we consider software-level mechanisms to comprehensively detect transient memory faults. We develop novel compile-time algorithms to instrument application programs with checksum computation codes to detect memory errors. Unlike prior approaches that employ checksums on computational and architectural states, our scheme verifies every data access and works by tracking variables as they are produced and consumed. Experimental evaluation demonstrates that the proposed comprehensive error detection solution is viable as a completely software-only scheme. We also demonstrate that with limited hardware support, overheads of error detection can be further reduced.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Le:2014:CVE, author = "Vu Le and Mehrdad Afshari and Zhendong Su", title = "Compiler validation via equivalence modulo inputs", journal = j-SIGPLAN, volume = "49", number = "6", pages = "216--226", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594334", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce equivalence modulo inputs (EMI), a simple, widely applicable methodology for validating optimizing compilers. Our key insight is to exploit the close interplay between (1) dynamically executing a program on some test inputs and (2) statically compiling the program to work on all possible inputs. Indeed, the test inputs induce a natural collection of the original program's EMI variants, which can help differentially test any compiler and specifically target the difficult-to-find miscompilations. To create a practical implementation of EMI for validating C compilers, we profile a program's test executions and stochastically prune its unexecuted code. 
Our extensive testing in eleven months has led to 147 confirmed, unique bug reports for GCC and LLVM alone. The majority of those bugs are miscompilations, and more than 100 have already been fixed. Beyond testing compilers, EMI can be adapted to validate program transformation and analysis systems in general. This work opens up this exciting, new direction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Long:2014:ARE, author = "Fan Long and Stelios Sidiroglou-Douskos and Martin Rinard", title = "Automatic runtime error repair and containment via recovery shepherding", journal = j-SIGPLAN, volume = "49", number = "6", pages = "227--238", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594337", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a system, RCV, for enabling software applications to survive divide-by-zero and null-dereference errors. RCV operates directly on off-the-shelf, production, stripped x86 binary executables. RCV implements recovery shepherding, which attaches to the application process when an error occurs, repairs the execution, tracks the repair effects as the execution continues, contains the repair effects within the application process, and detaches from the process after all repair effects are flushed from the process state. RCV therefore incurs negligible overhead during the normal execution of the application. We evaluate RCV on all divide-by-zero and null-dereference errors available in the CVE database [2] from January 2011 to March 2013 that (1) provide publicly-available inputs that trigger the error which (2) we were able to use to trigger the reported error in our experimental environment. We collected a total of 18 errors in seven real world applications, Wireshark, the FreeType library, Claws Mail, LibreOffice, GIMP, the PHP interpreter, and Chromium. For 17 of the 18 errors, RCV enables the application to continue to execute to provide acceptable output and service to its users on the error-triggering inputs. For 13 of the 18 errors, the continued RCV execution eventually flushes all of the repair effects and RCV detaches to restore the application to full clean functionality. 
We perform a manual analysis of the source code relevant to our benchmark errors, which indicates that for 11 of the 18 errors the RCV and later patched versions produce identical or equivalent results on all inputs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Zhang:2014:ARP, author = "Xin Zhang and Ravi Mangal and Radu Grigore and Mayur Naik and Hongseok Yang", title = "On abstraction refinement for program analyses in {Datalog}", journal = j-SIGPLAN, volume = "49", number = "6", pages = "239--248", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594327", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A central task for a program analysis concerns how to efficiently find a program abstraction that keeps only information relevant for proving properties of interest. We present a new approach for finding such abstractions for program analyses written in Datalog. Our approach is based on counterexample-guided abstraction refinement: when a Datalog analysis run fails using an abstraction, it seeks to generalize the cause of the failure to other abstractions, and pick a new abstraction that avoids a similar failure. Our solution uses a boolean satisfiability formulation that is general, complete, and optimal: it is independent of the Datalog solver, it generalizes the failure of an abstraction to as many other abstractions as possible, and it identifies the cheapest refined abstraction to try next. We show the performance of our approach on a pointer analysis and a typestate analysis, on eight real-world Java benchmark programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Zhang:2014:HTB, author = "Xin Zhang and Ravi Mangal and Mayur Naik and Hongseok Yang", title = "Hybrid top-down and bottom-up interprocedural analysis", journal = j-SIGPLAN, volume = "49", number = "6", pages = "249--258", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594328", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Interprocedural static analyses are broadly classified into top-down and bottom-up, depending upon how they compute, instantiate, and reuse procedure summaries. Both kinds of analyses are challenging to scale: top-down analyses are hindered by ineffective reuse of summaries whereas bottom-up analyses are hindered by inefficient computation and instantiation of summaries. This paper presents a hybrid approach Swift that combines top-down and bottom-up analyses in a manner that gains their benefits without suffering their drawbacks. Swift is general in that it is parametrized by the top-down and bottom-up analyses it combines. We show an instantiation of Swift on a type-state analysis and evaluate it on a suite of 12 Java programs of size 60-250 KLOC each.
Swift outperforms both conventional approaches, finishing on all the programs while both of those approaches fail on the larger programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Arzt:2014:FPC, author = "Steven Arzt and Siegfried Rasthofer and Christian Fritz and Eric Bodden and Alexandre Bartel and Jacques Klein and Yves {Le Traon} and Damien Octeau and Patrick McDaniel", title = "{FlowDroid}: precise context, flow, field, object-sensitive and lifecycle-aware taint analysis for {Android} apps", journal = j-SIGPLAN, volume = "49", number = "6", pages = "259--269", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594299", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today's smartphones are a ubiquitous source of private and confidential data. At the same time, smartphone users are plagued by carelessly programmed apps that leak important data by accident, and by malicious apps that exploit their given privileges to copy such data intentionally. While existing static taint-analysis approaches have the potential of detecting such data leaks ahead of time, all approaches for Android use a number of coarse-grain approximations that can yield high numbers of missed leaks and false alarms. In this work we thus present FlowDroid, a novel and highly precise static taint analysis for Android applications. A precise model of Android's lifecycle allows the analysis to properly handle callbacks invoked by the Android framework, while context, flow, field and object-sensitivity allows the analysis to reduce the number of false alarms. Novel on-demand algorithms help FlowDroid maintain high efficiency and precision at the same time. We also propose DroidBench, an open test suite for evaluating the effectiveness and accuracy of taint-analysis tools specifically for Android apps. As we show through a set of experiments using SecuriBench Micro, DroidBench, and a set of well-known Android test applications, FlowDroid finds a very high fraction of data leaks while keeping the rate of false positives low. On DroidBench, FlowDroid achieves 93\% recall and 86\% precision, greatly outperforming the commercial tools IBM AppScan Source and Fortify SCA. 
FlowDroid successfully finds leaks in a subset of 500 apps from Google Play and about 1,000 malware apps from the VirusShare project.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Carbonneaux:2014:EEV, author = "Quentin Carbonneaux and Jan Hoffmann and Tahina Ramananandro and Zhong Shao", title = "End-to-end verification of stack-space bounds for {C} programs", journal = j-SIGPLAN, volume = "49", number = "6", pages = "270--281", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594301", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Verified compilers guarantee the preservation of semantic properties and thus enable formal verification of programs at the source level. However, important quantitative properties such as memory and time usage still have to be verified at the machine level where interactive proofs tend to be more tedious and automation is more challenging. This article describes a framework that enables the formal verification of stack-space bounds of compiled machine code at the C level. It consists of a verified CompCert-based compiler that preserves quantitative properties, a verified quantitative program logic for interactive stack-bound development, and a verified stack analyzer that automatically derives stack bounds during compilation. The framework is based on event traces that record function calls and returns. The source language is CompCert Clight and the target language is x86 assembly. The compiler is implemented in the Coq Proof Assistant and it is proved that crucial properties of event traces are preserved during compilation. A novel quantitative Hoare logic is developed to verify stack-space bounds at the CompCert Clight level. The quantitative logic is implemented in Coq and proved sound with respect to event traces generated by the small-step semantics of CompCert Clight. Stack-space bounds can be proved at the source level without taking into account low-level details that depend on the implementation of the compiler. The compiler fills in these low-level details during compilation and generates a concrete stack-space bound that applies to the produced machine code. The verified stack analyzer is guaranteed to automatically derive bounds for code with non-recursive functions. It generates a derivation in the quantitative logic to ensure soundness as well as interoperability with interactively developed stack bounds. In an experimental evaluation, the developed framework is used to obtain verified stack-space bounds for micro benchmarks as well as real system code. The examples include the verified operating-system kernel CertiKOS, parts of the MiBench embedded benchmark suite, and programs from the CompCert benchmarks. 
The derived bounds are close to the measured stack-space usage of executions of the compiled programs on a Linux x86 system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Ball:2014:VTV, author = "Thomas Ball and Nikolaj Bj{\o}rner and Aaron Gember and Shachar Itzhaky and Aleksandr Karbyshev and Mooly Sagiv and Michael Schapira and Asaf Valadarsky", title = "{VeriCon}: towards verifying controller programs in software-defined networks", journal = j-SIGPLAN, volume = "49", number = "6", pages = "282--293", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594317", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software-defined networking (SDN) is a new paradigm for operating and managing computer networks. SDN enables logically-centralized control over network devices through a ``controller'' software that operates independently from the network hardware, and can be viewed as the network operating system. Network operators can run both inhouse and third-party SDN programs (often called applications) on top of the controller, e.g., to specify routing and access control policies. SDN opens up the possibility of applying formal methods to prove the correctness of computer networks. Indeed, recently much effort has been invested in applying finite state model checking to check that SDN programs behave correctly. However, in general, scaling these methods to large networks is challenging and, moreover, they cannot guarantee the absence of errors. We present VeriCon, the first system for verifying that an SDN program is correct on all admissible topologies and for all possible (infinite) sequences of network events. VeriCon either confirms the correctness of the controller program on all admissible network topologies or outputs a concrete counterexample. VeriCon uses first-order logic to specify admissible network topologies and desired network-wide invariants, and then implements classical Floyd-Hoare-Dijkstra deductive verification using Z3. Our preliminary experience indicates that VeriCon is able to rapidly verify correctness, or identify bugs, for a large repertoire of simple core SDN programs. VeriCon is compositional, in the sense that it verifies the correctness of execution of any single network event w.r.t. the specified invariant, and can thus scale to handle large programs. To relieve the burden of specifying inductive invariants from the programmer, VeriCon includes a separate procedure for inferring invariants, which is shown to be effective on simple controller programs. We view VeriCon as a first step en route to practical mechanisms for verifying network-wide invariants of SDN programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Logozzo:2014:VMV, author = "Francesco Logozzo and Shuvendu K. 
Lahiri and Manuel F{\"a}hndrich and Sam Blackshear", title = "Verification modulo versions: towards usable verification", journal = j-SIGPLAN, volume = "49", number = "6", pages = "294--304", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594326", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce Verification Modulo Versions (VMV), a new static analysis technique for reducing the number of alarms reported by static verifiers while providing sound semantic guarantees. First, VMV extracts semantic environment conditions from a base program P. Environmental conditions can either be sufficient conditions (implying the safety of P) or necessary conditions (implied by the safety of P). Then, VMV instruments a new version of the program, P', with the inferred conditions. We prove that we can use (i) sufficient conditions to identify abstract regressions of P' w.r.t. P; and (ii) necessary conditions to prove the relative correctness of P' w.r.t. P. We show that the extraction of environmental conditions can be performed at a hierarchy of abstraction levels (history, state, or call conditions) with each subsequent level requiring a less sophisticated matching of the syntactic changes between P' and P. Call conditions are particularly useful because they only require the syntactic matching of entry points and callee names across program versions. We have implemented VMV in a widely used static analysis and verification tool. We report our experience on two large code bases and demonstrate a substantial reduction in alarms while additionally providing relative correctness guarantees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Dimitrov:2014:CRD, author = "Dimitar Dimitrov and Veselin Raychev and Martin Vechev and Eric Koskinen", title = "Commutativity race detection", journal = j-SIGPLAN, volume = "49", number = "6", pages = "305--315", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594322", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces the concept of a commutativity race. A commutativity race occurs in a given execution when two library method invocations can happen concurrently yet they do not commute. Commutativity races are an elegant concept enabling reasoning about concurrent interaction at the library interface. We present a dynamic commutativity race detector. Our technique is based on a novel combination of vector clocks and a structural representation automatically obtained from a commutativity specification. Conceptually, our work can be seen as generalizing classical read-write race detection. We also present a new logical fragment for specifying commutativity conditions. This fragment is expressive, yet guarantees a constant number of comparisons per method invocation rather than linear with unrestricted specifications. We implemented our analyzer and evaluated it on real-world applications. 
Experimental results indicate that our analysis is practical: it discovered harmful commutativity races with overhead comparable to state-of-the-art, low-level race detectors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Maiya:2014:RDA, author = "Pallavi Maiya and Aditya Kanade and Rupak Majumdar", title = "Race detection for {Android} applications", journal = j-SIGPLAN, volume = "49", number = "6", pages = "316--325", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594311", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming environments for smartphones expose a concurrency model that combines multi-threading and asynchronous event-based dispatch. While this enables the development of efficient and feature-rich applications, unforeseen thread interleavings coupled with non-deterministic reorderings of asynchronous tasks can lead to subtle concurrency errors in the applications. In this paper, we formalize the concurrency semantics of the Android programming model. We further define the happens-before relation for Android applications, and develop a dynamic race detection technique based on this relation. Our relation generalizes the so far independently studied happens-before relations for multi-threaded programs and single-threaded event-driven programs. Additionally, our race detection technique uses a model of the Android runtime environment to reduce false positives. We have implemented a tool called DroidRacer. It generates execution traces by systematically testing Android applications and detects data races by computing the happens-before relation on the traces. We analyzed 15 Android applications including popular applications such as Facebook, Twitter and K-9 Mail. Our results indicate that data races are prevalent in Android applications, and that DroidRacer is an effective tool to identify data races.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Hsiao:2014:RDE, author = "Chun-Hung Hsiao and Jie Yu and Satish Narayanasamy and Ziyun Kong and Cristiano L. Pereira and Gilles A. Pokam and Peter M. Chen and Jason Flinn", title = "Race detection for event-driven mobile applications", journal = j-SIGPLAN, volume = "49", number = "6", pages = "326--336", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594330", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mobile systems commonly support an event-based model of concurrent programming. This model, used in popular platforms such as Android, naturally supports mobile devices that have a rich array of sensors and user input modalities. Unfortunately, most existing tools for detecting concurrency errors of parallel programs focus on a thread-based model of concurrency. 
If one applies such tools directly to an event-based program, they work poorly because they infer false dependencies between unrelated events handled sequentially by the same thread. In this paper we present a race detection tool named CAFA for event-driven mobile systems. CAFA uses the causality model that we have developed for the Android event-driven system. A novel contribution of our model is that it accounts for the causal order due to the event queues, which are not accounted for in past data race detectors. Detecting races based on low-level races between memory accesses leads to a large number of false positives. CAFA overcomes this problem by checking for races between high-level operations. We discuss our experience in using CAFA for finding and understanding a number of known and unknown harmful races in open-source Android applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Huang:2014:MSP, author = "Jeff Huang and Patrick O'Neil Meredith and Grigore Rosu", title = "Maximal sound predictive race detection with control flow abstraction", journal = j-SIGPLAN, volume = "49", number = "6", pages = "337--348", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594315", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite the numerous static and dynamic program analysis techniques in the literature, data races remain one of the most common bugs in modern concurrent software. Further, the techniques that do exist either have limited detection capability or are unsound, meaning that they report false positives. We present a sound race detection technique that achieves a provably higher detection capability than existing sound techniques. A key insight of our technique is the inclusion of abstracted control flow information into the execution model, which increases the space of the causal model permitted by classical happens-before or causally-precedes based detectors. By encoding the control flow and a minimal set of feasibility constraints as a group of first-order logic formulae, we formulate race detection as a constraint solving problem. Moreover, we formally prove that our formulation achieves the maximal possible detection capability for any sound dynamic race detector with respect to the same input trace under the sequential consistency memory model. We demonstrate via extensive experimentation that our technique detects more races than the other state-of-the-art sound race detection techniques, and that it is scalable to executions of real world concurrent applications with tens of millions of critical events. These experiments also revealed several previously unknown races in real systems (e.g., Eclipse) that have been confirmed or fixed by the developers. 
Our tool is also adopted by Eclipse developers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{David:2014:TBC, author = "Yaniv David and Eran Yahav", title = "Tracelet-based code search in executables", journal = j-SIGPLAN, volume = "49", number = "6", pages = "349--360", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594343", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We address the problem of code search in executables. Given a function in binary form and a large code base, our goal is to statically find similar functions in the code base. Towards this end, we present a novel technique for computing similarity between functions. Our notion of similarity is based on decomposition of functions into tracelets: continuous, short, partial traces of an execution. To establish tracelet similarity in the face of low-level compiler transformations, we employ a simple rewriting engine. This engine uses constraint solving over alignment constraints and data dependencies to match registers and memory addresses between tracelets, bridging the gap between tracelets that are otherwise similar. We have implemented our approach and applied it to find matches in over a million binary functions. We compare tracelet matching to approaches based on n-grams and graphlets and show that tracelet matching obtains dramatically better precision and recall.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Pombrio:2014:RLE, author = "Justin Pombrio and Shriram Krishnamurthi", title = "Resugaring: lifting evaluation sequences through syntactic sugar", journal = j-SIGPLAN, volume = "49", number = "6", pages = "361--371", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594319", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Syntactic sugar is pervasive in language technology. It is used to shrink the size of a core language; to define domain-specific languages; and even to let programmers extend their language. Unfortunately, syntactic sugar is eliminated by transformation, so the resulting programs become unfamiliar to authors. Thus, it comes at a price: it obscures the relationship between the user's source program and the program being evaluated. We address this problem by showing how to compute reduction steps in terms of the surface syntax. Each step in the surface language emulates one or more steps in the core language. The computed steps hide the transformation, thus maintaining the abstraction provided by the surface language. We make these statements about emulation and abstraction precise, prove that they hold in our formalism, and verify part of the system in Coq. 
We have implemented this work and applied it to three very different languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{vonHanxleden:2014:SSC, author = "Reinhard von Hanxleden and Bj{\"o}rn Duderstadt and Christian Motika and Steven Smyth and Michael Mendler and Joaqu{\'\i}n Aguado and Stephen Mercer and Owen O'Brien", title = "{SCCharts}: sequentially constructive statecharts for safety-critical applications: {HW\slash SW}-synthesis for a conservative extension of synchronous statecharts", journal = j-SIGPLAN, volume = "49", number = "6", pages = "372--383", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594310", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new visual language, SCCharts, designed for specifying safety-critical reactive systems. SCCharts use a statechart notation and provide determinate concurrency based on a synchronous model of computation (MoC), without restrictions common to previous synchronous MoCs. Specifically, we lift earlier limitations on sequential accesses to shared variables, by leveraging the sequentially constructive MoC. The semantics and key features of SCCharts are defined by a very small set of elements, the Core SCCharts, consisting of state machines plus fork/join concurrency. We also present a compilation chain that allows efficient synthesis of software and hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{DAntoni:2014:FTB, author = "Loris D'Antoni and Margus Veanes and Benjamin Livshits and David Molnar", title = "{Fast}: a transducer-based language for tree manipulation", journal = j-SIGPLAN, volume = "49", number = "6", pages = "384--394", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594309", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Tree automata and tree transducers are used in a wide range of applications in software engineering, from XML processing to language type-checking. While these formalisms are of immense practical use, they can only model finite alphabets, and since many real-world applications operate over infinite domains such as integers, this is often a limitation. To overcome this problem we augment tree automata and transducers with symbolic alphabets represented as parametric theories. Admitting infinite alphabets makes these models more general and succinct than their classical counterparts. Despite this, we show how the main operations, such as composition and language equivalence, remain computable given a decision procedure for the alphabet theory. We introduce a high-level language called Fast that acts as a front-end for the above formalisms. Fast supports symbolic alphabets through tight integration with state-of-the-art satisfiability modulo theory (SMT) solvers. 
We demonstrate our techniques on practical case studies, covering a wide range of applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Misra:2014:PPC, author = "Jayadev Misra", title = "A personal perspective on concurrency", journal = j-SIGPLAN, volume = "49", number = "6", pages = "395--395", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2604003", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This talk will describe a view of concurrency, the author's own, as it has evolved since the late 1970s. Early notions of concurrency were intimately tied with physical hardware and speeding up of computations, which proved to be an impediment to the development of a logical theory of concurrency. In collaboration with K. Mani Chandy, the author developed a theory called UNITY that combined a programming notation with a verification logic to describe a large class of fundamental concurrent algorithms arising in operating systems, communication protocols and distributed systems. Several model checkers, including Murphi, developed by David Dill, are based on UNITY. A limitation of UNITY was a lack of adequate structuring mechanism. While this was not a major problem in low-level applications, the current wide-spread use of concurrency requires theories that go beyond managing infrastructure to the level of massive applications. Our current research, a programming model called Orc, introduces mechanisms to organize the communication, synchronization and coordination in programs that run on wide-area networks. Orc includes constructs to orchestrate the concurrent invocation of services to achieve a goal --- while managing time-outs, priorities, and failure of sites or communication.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Phothilimthana:2014:CSA, author = "Phitchaya Mangpo Phothilimthana and Tikhon Jelvis and Rohin Shah and Nishant Totla and Sarah Chasins and Rastislav Bodik", title = "{Chlorophyll}: synthesis-aided compiler for low-power spatial architectures", journal = j-SIGPLAN, volume = "49", number = "6", pages = "396--407", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594339", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We developed Chlorophyll, a synthesis-aided programming model and compiler for the GreenArrays GA144, an extremely minimalist low-power spatial architecture that requires partitioning the program into fragments of no more than 256 instructions and 64 words of data. This processor is 100-times more energy efficient than its competitors, but currently can only be programmed using a low-level stack-based language. The Chlorophyll programming model allows programmers to provide human insight by specifying partial partitioning of data and computation. 
The Chlorophyll compiler relies on synthesis, sidestepping the need to develop classical optimizations, which may be challenging given the unusual architecture. To scale synthesis to real problems, we decompose the compilation into smaller synthesis subproblems---partitioning, layout, and code generation. We show that the synthesized programs are no more than 65\% slower than highly optimized expert-written programs and are faster than programs produced by a heuristic, non-synthesizing version of our compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Perelman:2014:TDS, author = "Daniel Perelman and Sumit Gulwani and Dan Grossman and Peter Provost", title = "Test-driven synthesis", journal = j-SIGPLAN, volume = "49", number = "6", pages = "408--418", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594297", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming-by-example technologies empower end-users to create simple programs merely by providing input/output examples. Existing systems are designed around solvers specialized for a specific set of data types or domain-specific language (DSL). We present a program synthesizer which can be parameterized by an arbitrary DSL that may contain conditionals and loops and therefore is able to synthesize programs in any domain. In order to use our synthesizer, the user provides a sequence of increasingly sophisticated input/output examples along with an expert-written DSL definition. These two inputs correspond to the two key ideas that allow our synthesizer to work in arbitrary domains. First, we developed a novel iterative synthesis technique inspired by test-driven development---which also gives our technique the name of test-driven synthesis---where the input/output examples are consumed one at a time as the program is refined. Second, the DSL allows our system to take an efficient component-based approach to enumerating possible programs. We present applications of our synthesis methodology to end-user programming for transformations over strings, XML, and table layouts. We compare our synthesizer on these applications to state-of-the-art DSL-specific synthesizers as well as to the general-purpose synthesizer Sketch.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Raychev:2014:CCS, author = "Veselin Raychev and Martin Vechev and Eran Yahav", title = "Code completion with statistical language models", journal = j-SIGPLAN, volume = "49", number = "6", pages = "419--428", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594321", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We address the problem of synthesizing code completions for programs using APIs. Given a program with holes, we synthesize completions for holes with the most likely sequences of method calls.
Our main idea is to reduce the problem of code completion to a natural-language processing problem of predicting probabilities of sentences. We design a simple and scalable static analysis that extracts sequences of method calls from a large codebase, and index these into a statistical language model. We then employ the language model to find the highest ranked sentences, and use them to synthesize a code completion. Our approach is able to synthesize sequences of calls across multiple objects together with their arguments. Experiments show that our approach is fast and effective. Virtually all computed completions typecheck, and the desired completion appears in the top 3 results in 90\% of the cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Greenaway:2014:DSS, author = "David Greenaway and Japheth Lim and June Andronick and Gerwin Klein", title = "Don't sweat the small stuff: formal verification of {C} code without the pain", journal = j-SIGPLAN, volume = "49", number = "6", pages = "429--439", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594296", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an approach for automatically generating provably correct abstractions from C source code that are useful for practical implementation verification. The abstractions are easier for a human verification engineer to reason about than the implementation and increase the productivity of interactive code proof. We guarantee soundness by automatically generating proofs that the abstractions are correct. In particular, we show two key abstractions that are critical for verifying systems-level C code: automatically turning potentially overflowing machine-word arithmetic into ideal integers, and transforming low-level C pointer reasoning into separate abstract heaps. Previous work carrying out such transformations has either done so using unverified translations, or required significant proof engineering effort. We implement these abstractions in an existing proof-producing specification transformation framework named AutoCorres, developed in Isabelle/HOL, and demonstrate its effectiveness in a number of case studies. We show scalability on multiple OS microkernels, and we show how our changes to AutoCorres improve productivity for total correctness by porting an existing high-level verification of the Schorr--Waite algorithm to a low-level C implementation with minimal effort.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Pek:2014:NPD, author = "Edgar Pek and Xiaokang Qiu and P. 
Madhusudan", title = "Natural proofs for data structure manipulation in {C} using separation logic", journal = j-SIGPLAN, volume = "49", number = "6", pages = "440--451", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594325", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The natural proof technique for heap verification developed by Qiu et al. [32] provides a platform for powerful sound reasoning for specifications written in a dialect of separation logic called Dryad. Natural proofs are proof tactics that enable automated reasoning exploiting recursion, mimicking common patterns found in human proofs. However, these proofs are known to work only for a simple toy language [32]. In this work, we develop a framework called VCDryad that extends the Vcc framework [9] to provide an automated deductive framework against separation logic specifications for C programs based on natural proofs. We develop several new techniques to build this framework, including (a) a novel tool architecture that allows encoding natural proofs at a higher level in order to use the existing Vcc framework (including its intricate memory model, the underlying type-checker, and the SMT-based verification infrastructure), and (b) a synthesis of ghost-code annotations that captures natural proof tactics, in essence forcing Vcc to find natural proofs using primarily decidable theories. We evaluate our tool extensively, on more than 150 programs, ranging from code manipulating standard data structures, well-known open source library routines (Glib, OpenBSD), Linux kernel routines, customized OS data structures, etc. We show that all these C programs can be fully automatically verified using natural proofs (given pre/post conditions and loop invariants) without any user-provided proof tactics. VCDryad is perhaps the first deductive verification framework for heap-manipulating programs in a real language that can prove such a wide variety of programs automatically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Ricketts:2014:AFP, author = "Daniel Ricketts and Valentin Robert and Dongseok Jang and Zachary Tatlock and Sorin Lerner", title = "Automating formal proofs for reactive systems", journal = j-SIGPLAN, volume = "49", number = "6", pages = "452--462", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594338", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Implementing systems in proof assistants like Coq and proving their correctness in full formal detail has consistently demonstrated promise for making extremely strong guarantees about critical software, ranging from compilers and operating systems to databases and web browsers. Unfortunately, these verifications demand such heroic manual proof effort, even for a single system, that the approach has not been widely adopted. 
We demonstrate a technique to eliminate the manual proof burden for verifying many properties within an entire class of applications, in our case reactive systems, while only expending effort comparable to the manual verification of a single system. A crucial insight of our approach is simultaneously designing both (1) a domain-specific language (DSL) for expressing reactive systems and their correctness properties and (2) proof automation which exploits the constrained language of both programs and properties to enable fully automatic, pushbutton verification. We apply this insight in a deeply embedded Coq DSL, dubbed Reflex, and illustrate Reflex's expressiveness by implementing and automatically verifying realistic systems including a modern web browser, an SSH server, and a web server. Using Reflex radically reduced the proof burden: in previous, similar versions of our benchmarks written in Coq by experts, proofs accounted for over 80\% of the code base; our versions require no manual proofs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Xiao:2014:PPI, author = "Xiao Xiao and Qirun Zhang and Jinguo Zhou and Charles Zhang", title = "Persistent pointer information", journal = j-SIGPLAN, volume = "49", number = "6", pages = "463--474", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594314", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Pointer information, indispensable for static analysis tools, is expensive to compute and query. We provide a query-efficient persistence technique, Pestrie, to mitigate the costly computation and slow querying of precise pointer information. Leveraging equivalence and hub properties, Pestrie can compress pointer information and answers pointer related queries very efficiently. The experiment shows that Pestrie produces 10.5X and 17.5X smaller persistent files than the traditional bitmap and BDD encodings. Meanwhile, Pestrie is 2.9X to 123.6X faster than traditional demand-driven approaches for serving points-to related queries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Oh:2014:SCS, author = "Hakjoo Oh and Wonchan Lee and Kihong Heo and Hongseok Yang and Kwangkeun Yi", title = "Selective context-sensitivity guided by impact pre-analysis", journal = j-SIGPLAN, volume = "49", number = "6", pages = "475--484", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594318", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a method for selectively applying context-sensitivity during interprocedural program analysis. Our method applies context-sensitivity only when and where doing so is likely to improve the precision that matters for resolving given queries. 
The idea is to use a pre-analysis to estimate the impact of context-sensitivity on the main analysis's precision, and to use this information to find out when and where the main analysis should turn on or off its context-sensitivity. We formalize this approach and prove that the analysis always benefits from the pre-analysis-guided context-sensitivity. We implemented this selective method for an existing industrial-strength interval analyzer for full C. The method reduced the number of (false) alarms by 24.4\%, while increasing the analysis cost by 27.8\% on average. The use of the selective method is not limited to context-sensitivity. We demonstrate this generality by following the same principle and developing a selective relational analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Smaragdakis:2014:IAC, author = "Yannis Smaragdakis and George Kastrinis and George Balatsouras", title = "Introspective analysis: context-sensitivity, across the board", journal = j-SIGPLAN, volume = "49", number = "6", pages = "485--495", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594320", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Context-sensitivity is the primary approach for adding more precision to a points-to analysis, while hopefully also maintaining scalability. An oft-reported problem with context-sensitive analyses, however, is that they are bi-modal: either the analysis is precise enough that it manipulates only manageable sets of data, and thus scales impressively well, or the analysis gets quickly derailed at the first sign of imprecision and becomes orders-of-magnitude more expensive than would be expected given the program's size. There is currently no approach that makes precise context-sensitive analyses (of any flavor: call-site-, object-, or type-sensitive) scale across the board at a level comparable to that of a context-insensitive analysis. To address this issue, we propose introspective analysis: a technique for uniformly scaling context-sensitive analysis by eliminating its performance-detrimental behavior, at a small precision expense. Introspective analysis consists of a common adaptivity pattern: first perform a context-insensitive analysis, then use the results to selectively refine (i.e., analyze context-sensitively) program elements that will not cause explosion in the running time or space. The technical challenge is to appropriately identify such program elements. We show that a simple but principled approach can be remarkably effective, achieving scalability (often with dramatic speedup) for benchmarks previously completely out-of-reach for deep context-sensitive analyses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Ahn:2014:IJP, author = "Wonsun Ahn and Jiho Choi and Thomas Shull and Mar{\'\i}a J. 
Garzar{\'a}n and Josep Torrellas", title = "Improving {JavaScript} performance by deconstructing the type system", journal = j-SIGPLAN, volume = "49", number = "6", pages = "496--507", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594332", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Increased focus on JavaScript performance has resulted in vast performance improvements for many benchmarks. However, for actual code used in websites, the attained improvements often lag far behind those for popular benchmarks. This paper shows that the main reason behind this short-fall is how the compiler understands types. JavaScript has no concept of types, but the compiler assigns types to objects anyway for ease of code generation. We examine the way that the Chrome V8 compiler defines types, and identify two design decisions that are the main reasons for the lack of improvement: (1) the inherited prototype object is part of the current object's type definition, and (2) method bindings are also part of the type definition. These requirements make types very unpredictable, which hinders type specialization by the compiler. Hence, we modify V8 to remove these requirements, and use it to compile the JavaScript code assembled by JSBench from real websites. On average, we reduce the execution time of JSBench by 36\%, and the dynamic instruction count by 49\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Vilk:2014:DBB, author = "John Vilk and Emery D. Berger", title = "{Doppio}: breaking the browser language barrier", journal = j-SIGPLAN, volume = "49", number = "6", pages = "508--518", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594293", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Web browsers have become a de facto universal operating system, and JavaScript its instruction set. Unfortunately, running other languages in the browser is not generally possible. Translation to JavaScript is not enough because browsers are a hostile environment for other languages. Previous approaches are either non-portable or require extensive modifications for programs to work in a browser. This paper presents Doppio, a JavaScript-based runtime system that makes it possible to run unaltered applications written in general-purpose languages directly inside the browser. Doppio provides a wide range of runtime services, including a file system that enables local and external (cloud-based) storage, an unmanaged heap, sockets, blocking I/O, and multiple threads. We demonstrate DOPPIO's usefulness with two case studies: we extend Emscripten with Doppio, letting it run an unmodified C++ application in the browser with full functionality, and present DoppioJVM, an interpreter that runs unmodified JVM programs directly in the browser. 
While substantially slower than a native JVM (between 24X and 42X slower on CPU-intensive benchmarks in Google Chrome), DoppioJVM makes it feasible to directly reuse existing, non compute-intensive code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Lu:2014:DED, author = "Li Lu and Weixing Ji and Michael L. Scott", title = "Dynamic enforcement of determinism in a parallel scripting language", journal = j-SIGPLAN, volume = "49", number = "6", pages = "519--529", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594300", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Determinism is an appealing property for parallel programs, as it simplifies understanding, reasoning and debugging. It is particularly appealing in dynamic (scripting) languages, where ease of programming is a dominant design goal. Some existing parallel languages use the type system to enforce determinism statically, but this is not generally practical for dynamic languages. In this paper, we describe how determinism can be obtained---and dynamically enforced/verified---for appropriate extensions to a parallel scripting language. Specifically, we introduce the constructs of Deterministic Parallel Ruby (DPR), together with a run-time system (Tardis) that verifies properties required for determinism, including correct usage of reductions and commutative operators, and the mutual independence (data-race freedom) of concurrent tasks. Experimental results confirm that DPR can provide scalable performance on multicore machines and that the overhead of Tardis is low enough for practical testing. In particular, Tardis significantly outperforms alternative data-race detectors with comparable functionality. We conclude with a discussion of future directions in the dynamic enforcement of determinism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Torlak:2014:LSV, author = "Emina Torlak and Rastislav Bodik", title = "A lightweight symbolic virtual machine for solver-aided host languages", journal = j-SIGPLAN, volume = "49", number = "6", pages = "530--541", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594340", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Solver-aided domain-specific languages (SDSLs) are an emerging class of computer-aided programming systems. They ease the construction of programs by using satisfiability solvers to automate tasks such as verification, debugging, synthesis, and non-deterministic execution. But reducing programming tasks to satisfiability problems involves translating programs to logical constraints, which is an engineering challenge even for domain-specific languages. We have previously shown that translation to constraints can be avoided if SDSLs are implemented by (traditional) embedding into a host language that is itself solver-aided. 
This paper describes how to implement a symbolic virtual machine (SVM) for such a host language. Our symbolic virtual machine is lightweight because it compiles to constraints only a small subset of the host's constructs, while allowing SDSL designers to use the entire language, including constructs for DSL embedding. This lightweight compilation employs a novel symbolic execution technique with two key properties: it produces compact encodings, and it enables concrete evaluation to strip away host constructs that are outside the subset compilable to constraints. Our symbolic virtual machine architecture is at the heart of Rosette, a solver-aided language that is host to several new SDSLs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Le:2014:FFD, author = "Vu Le and Sumit Gulwani", title = "{FlashExtract}: a framework for data extraction by examples", journal = j-SIGPLAN, volume = "49", number = "6", pages = "542--553", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594333", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Various document types that combine model and view (e.g., text files, webpages, spreadsheets) make it easy to organize (possibly hierarchical) data, but make it difficult to extract raw data for any further manipulation or querying. We present a general framework FlashExtract to extract relevant data from semi-structured documents using examples. It includes: (a) an interaction model that allows end-users to give examples to extract various fields and to relate them in a hierarchical organization using structure and sequence constructs. (b) an inductive synthesis algorithm to synthesize the intended program from few examples in any underlying domain-specific language for data extraction that has been built using our specified algebra of few core operators (map, filter, merge, and pair). We describe instantiation of our framework to three different domains: text files, webpages, and spreadsheets. On our benchmark comprising 75 documents, FlashExtract is able to extract intended data using an average of 2.36 examples in 0.84 seconds per field.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Sousa:2014:CQU, author = "Marcelo Sousa and Isil Dillig and Dimitrios Vytiniotis and Thomas Dillig and Christos Gkantsidis", title = "Consolidation of queries with user-defined functions", journal = j-SIGPLAN, volume = "49", number = "6", pages = "554--564", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594305", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Motivated by streaming and data analytics scenarios where many queries operate on the same data and perform similar computations, we propose program consolidation for merging multiple user-defined functions (UDFs) that operate on the same input. 
Program consolidation exploits common computations between UDFs to generate an equivalent optimized function whose execution cost is often much smaller (and never greater) than the sum of the costs of executing each function individually. We present a sound consolidation calculus and an effective algorithm for consolidating multiple UDFs. Our approach is purely static and uses symbolic SMT-based techniques to identify shared or redundant computations. We have implemented the proposed technique on top of the Naiad data processing system. Our experiments show that our algorithm dramatically improves overall job completion time when executing user-defined filters that operate on the same data and perform similar computations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Luu:2014:MCC, author = "Loi Luu and Shweta Shinde and Prateek Saxena and Brian Demsky", title = "A model counter for constraints over unbounded strings", journal = j-SIGPLAN, volume = "49", number = "6", pages = "565--576", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594331", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Model counting is the problem of determining the number of solutions that satisfy a given set of constraints. Model counting has numerous applications in the quantitative analyses of program execution time, information flow, combinatorial circuit designs as well as probabilistic reasoning. We present a new approach to model counting for structured data types, specifically strings in this work. The key ingredient is a new technique that leverages generating functions as a basic primitive for combinatorial counting. Our tool SMC which embodies this approach can model count for constraints specified in an expressive string language efficiently and precisely, thereby outperforming previous finite-size analysis tools. SMC is expressive enough to model constraints arising in real-world JavaScript applications and UNIX C utilities. We demonstrate the practical feasibility of performing quantitative analyses arising in security applications, such as determining the comparative strengths of password strength meters and determining the information leakage via side channels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Niu:2014:MCF, author = "Ben Niu and Gang Tan", title = "Modular control-flow integrity", journal = j-SIGPLAN, volume = "49", number = "6", pages = "577--587", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594295", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Control-Flow Integrity (CFI) is a software-hardening technique. It inlines checks into a program so that its execution always follows a predetermined Control-Flow Graph (CFG). 
As a result, CFI is effective at preventing control-flow hijacking attacks. However, past fine-grained CFI implementations do not support separate compilation, which hinders its adoption. We present Modular Control-Flow Integrity (MCFI), a new CFI technique that supports separate compilation. MCFI allows modules to be independently instrumented and linked statically or dynamically. The combined module enforces a CFG that is a combination of the individual modules' CFGs. One challenge in supporting dynamic linking in multithreaded code is how to ensure a safe transition from the old CFG to the new CFG when libraries are dynamically linked. The key technique we use is to have the CFG represented in a runtime data structure and have reads and updates of the data structure wrapped in transactions to ensure thread safety. Our evaluation on SPECCPU2006 benchmarks shows that MCFI supports separate compilation, incurs low overhead of around 5\%, and enhances security.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Yang:2014:DSL, author = "Edward Z. Yang and David Mazi{\`e}res", title = "Dynamic space limits for {Haskell}", journal = j-SIGPLAN, volume = "49", number = "6", pages = "588--598", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594341", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe the semantics and implementation of a space limits system for Haskell, which allows programmers to create resource containers that enforce bounded resident memory usage at runtime. Our system is distinguished by a clear allocator-pays semantics drawn from previous experience with profiling in Haskell and an implementation strategy which uses a block-structured heap to organize containers, allowing us to enforce limits with high accuracy. To deal with the problem of deallocating data in a garbage collected heap, we propose a novel taint-based mechanism that unifies the existing practices of revocable pointers and killing threads in order to reclaim memory. Our system is implemented in GHC, a production-strength compiler for Haskell.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", received = "PLDI '14 conference proceedings.", } @Article{Tsafrir:2014:ELV, author = "Dan Tsafrir", title = "Experiences in the land of virtual abstractions", journal = j-SIGPLAN, volume = "49", number = "7", pages = "1--2", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576215", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "The Microsoft Research Drawbridge Project began with a simple question: Is it possible to achieve the benefits of hardware virtual machines without the overheads? Following that question, we have built a line of exploratory prototypes. These prototypes range from an ARM-based phone that runs x86 Windows binaries to new forms of secure computation. 
In this talk, I'll briefly describe our various prototypes and the evidence we have accumulated that our first question can be answered in the affirmative.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Hizver:2014:RTD, author = "Jennia Hizver and Tzi-cker Chiueh", title = "Real-time deep virtual machine introspection and its applications", journal = j-SIGPLAN, volume = "49", number = "7", pages = "3--14", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576196", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Virtual Machine Introspection (VMI) provides the ability to monitor virtual machines (VM) in an agentless fashion by gathering VM execution states from the hypervisor and analyzing those states to extract information about a running operating system (OS) without installing an agent inside the VM. VMI's main challenge lies in the difficulty in converting low-level byte string values into high-level semantic states of the monitored VM's OS. In this work, we tackle this challenge by developing a real-time kernel data structure monitoring (RTKDSM) system that leverages the rich OS analysis capabilities of Volatility, an open source computer forensics framework, to significantly simplify and automate analysis of VM execution states. The RTKDSM system is designed as an extensible software framework that is meant to be extended to perform application-specific VM state analysis. In addition, the RTKDSM system is able to perform real-time monitoring of any changes made to the extracted OS states of guest VMs. This real-time monitoring capability is especially important for VMI-based security applications. To minimize the performance overhead associated with real-time kernel data structure monitoring, the RTKDSM system has incorporated several optimizations whose effectiveness is reported in this paper.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Arya:2014:TRG, author = "Kapil Arya and Yury Baskakov and Alex Garthwaite", title = "Tesseract: reconciling guest {I/O} and hypervisor swapping in a {VM}", journal = j-SIGPLAN, volume = "49", number = "7", pages = "15--28", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576198", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Double-paging is an often-cited, if unsubstantiated, problem in multi-level scheduling of memory between virtual machines (VMs) and the hypervisor. This problem occurs when both a virtualized guest and the hypervisor overcommit their respective physical address-spaces. 
When the guest pages out memory previously swapped out by the hypervisor, it initiates an expensive sequence of steps causing the contents to be read in from the hypervisor swapfile only to be written out again, significantly lengthening the time to complete the guest I/O request. As a result, performance rapidly drops. We present Tesseract, a system that directly and transparently addresses the double-paging problem. Tesseract tracks when guest and hypervisor I/O operations are redundant and modifies these I/Os to create indirections to existing disk blocks containing the page contents. Although our focus is on reconciling I/Os between the guest disks and hypervisor swap, our technique is general and can reconcile, or deduplicate, I/Os for guest pages read or written by the VM. Deduplication of disk blocks for file contents accessed in a common manner is well-understood. One challenge that our approach faces is that the locality of guest I/Os (reflecting the guest's notion of disk layout) often differs from that of the blocks in the hypervisor swap. This loss of locality through indirection results in significant performance loss on subsequent guest reads. We propose two alternatives to recovering this lost locality, each based on the idea of asynchronously reorganizing the indirected blocks in persistent storage. We evaluate our system and show that it can significantly reduce the costs of double-paging. We focus our experiments on a synthetic benchmark designed to highlight its effects. In our experiments we observe Tesseract can improve our benchmark's throughput by as much as 200\% when using traditional disks and by as much as 30\% when using SSD. At the same time worst case application responsiveness can be improved by a factor of 5.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Kim:2014:VAM, author = "Hwanju Kim and Sangwook Kim and Jinkyu Jeong and Joonwon Lee", title = "Virtual asymmetric multiprocessor for interactive performance of consolidated desktops", journal = j-SIGPLAN, volume = "49", number = "7", pages = "29--40", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576199", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "This paper presents virtual asymmetric multiprocessor, a new scheme of virtual desktop scheduling on multi-core processors for user-interactive performance. The proposed scheme enables virtual CPUs to be dynamically performance-asymmetric based on their hosted workloads. To enhance user experience on consolidated desktops, our scheme provides interactive workloads with fast virtual CPUs, which have more computing power than those hosting background workloads in the same virtual machine. To this end, we devise a hypervisor extension that transparently classifies background tasks from potentially interactive workloads. In addition, we introduce a guest extension that manipulates the scheduling policy of an operating system in favor of our hypervisor-level scheme so that interactive performance can be further improved. 
Our evaluation shows that the proposed scheme significantly improves interactive performance of application launch, Web browsing, and video playback applications when CPU-intensive workloads highly disturb the interactive workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Ben-Yehuda:2014:GMD, author = "Orna Agmon Ben-Yehuda and Eyal Posener and Muli Ben-Yehuda and Assaf Schuster and Ahuva Mu'alem", title = "{Ginseng}: market-driven memory allocation", journal = j-SIGPLAN, volume = "49", number = "7", pages = "41--52", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576197", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Physical memory is the scarcest resource in today's cloud computing platforms. Cloud providers would like to maximize their clients' satisfaction by renting precious physical memory to those clients who value it the most. But real-world cloud clients are selfish: they will only tell their providers the truth about how much they value memory when it is in their own best interest to do so. How can real-world cloud providers allocate memory efficiently to those (selfish) clients who value it the most? We present Ginseng, the first market-driven cloud system that allocates memory efficiently to selfish cloud clients. Ginseng incentivizes selfish clients to bid their true value for the memory they need when they need it. Ginseng continuously collects client bids, finds an efficient memory allocation, and re-allocates physical memory to the clients that value it the most. Ginseng achieves a 6.2$ \times $--15.8x improvement (83\%--100\% of the optimum) in aggregate client satisfaction when compared with state-of-the-art approaches for cloud memory allocation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Hwang:2014:MFG, author = "Jinho Hwang and Ahsen Uppal and Timothy Wood and Howie Huang", title = "{Mortar}: filling the gaps in data center memory", journal = j-SIGPLAN, volume = "49", number = "7", pages = "53--64", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576203", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Data center servers are typically overprovisioned, leaving spare memory and CPU capacity idle to handle unpredictable workload bursts by the virtual machines running on them. While this allows for fast hotspot mitigation, it is also wasteful. Unfortunately, making use of spare capacity without impacting active applications is particularly difficult for memory since it typically must be allocated in coarse chunks over long timescales. In this work we propose repurposing the poorly utilized memory in a data center to store a volatile data store that is managed by the hypervisor. 
We present two uses for our Mortar framework: as a cache for prefetching disk blocks, and as an application-level distributed cache that follows the memcached protocol. Both prototypes use the framework to ask the hypervisor to store useful, but recoverable data within its free memory pool. This allows the hypervisor to control eviction policies and prioritize access to the cache. We demonstrate the benefits of our prototypes using realistic web applications and disk benchmarks, as well as memory traces gathered from live servers in our university's IT department. By expanding and contracting the data store size based on the free memory available, Mortar improves average response time of a web application by up to 35\% compared to a fixed size memcached deployment, and improves overall video streaming performance by 45\% through prefetching.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Chen:2014:CCB, author = "Licheng Chen and Zhipeng Wei and Zehan Cui and Mingyu Chen and Haiyang Pan and Yungang Bao", title = "{CMD}: classification-based memory deduplication through page access characteristics", journal = j-SIGPLAN, volume = "49", number = "7", pages = "65--76", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576204", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Limited main memory size is considered as one of the major bottlenecks in virtualization environments. Content-Based Page Sharing (CBPS) is an efficient memory deduplication technique to reduce server memory requirements, in which pages with same content are detected and shared into a single copy. As the widely used implementation of CBPS, Kernel Samepage Merging (KSM) maintains the whole memory pages into two global comparison trees (a stable tree and an unstable tree). To detect page sharing opportunities, each tracked page needs to be compared with pages already in these two large global trees. However since the vast majority of compared pages have different content with it, that will induce massive futility comparisons and thus heavy overhead. In this paper, we propose a lightweight page Classification-based Memory Deduplication approach named CMD to reduce futile page comparison overhead meanwhile to detect page sharing opportunities efficiently. The main innovation of CMD is that pages are grouped into different classifications based on page access characteristics. Pages with similar access characteristics are suggested to have higher possibility with same content, thus they are grouped into the same classification. In CMD, the large global comparison trees are divided into multiple small trees with dedicated local ones in each page classification. Page comparisons are performed just in the same classification, and pages from different classifications are never compared (since they probably result in futile comparisons). 
The experimental results show that CMD can efficiently reduce page comparisons (by about 68.5\%) meanwhile detect nearly the same (by more than 98\%) or even more page sharing opportunities.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Robatmili:2014:MRL, author = "Behnam Robatmili and Calin Cascaval and Mehrdad Reshadi and Madhukar N. Kedlaya and Seth Fowler and Vrajesh Bhavsar and Michael Weber and Ben Hardekopf", title = "{MuscalietJS}: rethinking layered dynamic web runtimes", journal = j-SIGPLAN, volume = "49", number = "7", pages = "77--88", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576211", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Layered JavaScript engines, in which the JavaScript runtime is built on top another managed runtime, provide better extensibility and portability compared to traditional monolithic engines. In this paper, we revisit the design of layered JavaScript engines and propose a layered architecture, called MuscalietJS2, that splits the responsibilities of a JavaScript engine between a high-level, JavaScript-specific component and a low-level, language-agnostic .NET VM. To make up for the performance loss due to layering, we propose a two pronged approach: high-level JavaScript optimizations and exploitation of low-level VM features that produce very efficient code for hot functions. We demonstrate the validity of the MuscalietJS design through a comprehensive evaluation using both the Sunspider benchmarks and a set of web workloads. We demonstrate that our approach outperforms other layered engines such as IronJS and Rhino engines while providing extensibility, adaptability and portability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Kalibera:2014:FAS, author = "Tomas Kalibera and Petr Maj and Floreal Morandat and Jan Vitek", title = "A fast abstract syntax tree interpreter for {R}", journal = j-SIGPLAN, volume = "49", number = "7", pages = "89--102", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576205", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Dynamic languages have been gaining popularity to the point that their performance is starting to matter. The effort required to develop a production-quality, high-performance runtime is, however, staggering and the expertise required to do so is often out of reach of the community maintaining a particular language. Many domain specific languages remain stuck with naive implementations, as they are easy to write and simple to maintain for domain scientists. 
In this paper, we try to see how far one can push a naive implementation while remaining portable and not requiring expertise in compilers and runtime systems. We choose the R language, a dynamic language used in statistics, as the target of our experiment and adopt the simplest possible implementation strategy, one based on evaluation of abstract syntax trees. We build our interpreter on top of a Java virtual machine and use only facilities available to all Java programmers. We compare our results to other implementations of R.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Kedlaya:2014:DDL, author = "Madhukar N. Kedlaya and Behnam Robatmili and Calin Cascaval and Ben Hardekopf", title = "Deoptimization for dynamic language {JITs} on typed, stack-based virtual machines", journal = j-SIGPLAN, volume = "49", number = "7", pages = "103--114", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576209", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "We are interested in implementing dynamic language runtimes on top of language-level virtual machines. Type specialization is a critical optimization for dynamic language runtimes: generic code that handles any type of data is replaced with specialized code for particular types observed during execution. However, types can change, and the runtime must recover whenever unexpected types are encountered. The state-of-the-art recovery mechanism is called deoptimization. Deoptimization is a well-known technique for dynamic language runtimes implemented in low-level languages like C. However, no dynamic language runtime implemented on top of a virtual machine such as the Common Language Runtime (CLR) or the Java Virtual Machine (JVM) uses deoptimization, because the implementation thereof used in low-level languages is not possible. In this paper we propose a novel technique that enables deoptimization for dynamic language runtimes implemented on top of typed, stack-based virtual machines. Our technique does not require any changes to the underlying virtual machine. We implement our proposed technique in a JavaScript language implementation, MCJS, running on top of the Mono runtime (CLR). We evaluate our implementation against the current state-of-the-art recovery mechanism for virtual machine-based runtimes, as implemented both in MCJS and in IronJS.
We show that deoptimization provides significant performance benefits, even for runtimes running on top of a virtual machine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Vitek:2014:CTR, author = "Jan Vitek", title = "The case for the three {R}'s of systems research: repeatability, reproducibility and rigor", journal = j-SIGPLAN, volume = "49", number = "7", pages = "115--116", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576216", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Computer systems research spans sub-disciplines that include embedded systems, programming languages, networking, and operating systems. In this talk my contention is that a number of structural factors inhibit quality systems research. Symptoms of the problem include unrepeatable and unreproduced results as well as results that are either devoid of meaning or that measure the wrong thing. I will illustrate the impact of these issues on our research output with examples from the development and empirical evaluation of the Schism real-time garbage collection algorithm that is shipped with the FijiVM --- a Java virtual machine for embedded and mobile devices. I will argue that our field should foster: repetition of results, independent reproduction, as well as rigorous evaluation. I will outline some baby steps taken by several computer conferences. In particular I will focus on the introduction of Artifact Evaluation Committees or AECs to ECOOP, OOPSLA, PLDI and soon POPL. The goal of the AECs is to encourage authors to package the software artifacts that they used to support the claims made in their paper and to submit these artifacts for evaluation. AECs were carefully designed to provide positive feedback to the authors that take the time to create repeatable research.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Chang:2014:EMV, author = "Chao-Jui Chang and Jan-Jan Wu and Wei-Chung Hsu and Pangfeng Liu and Pen-Chung Yew", title = "Efficient memory virtualization for {Cross-ISA} system mode emulation", journal = j-SIGPLAN, volume = "49", number = "7", pages = "117--128", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576201", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Cross-ISA system-mode emulation has many important applications. For example, Cross-ISA system-mode emulation helps computer architects and OS developers trace and debug kernel execution-flow efficiently by emulating a slower platform (such as ARM) on a more powerful platform (such as an x86 machine). Cross-ISA system-mode emulation also enables workload consolidation in data centers with platforms of different instruction-set architectures (ISAs).
However, system-mode emulation is much slower. One major overhead in system-mode emulation is the multi-level memory address translation that maps guest virtual address to host physical address. Shadow page tables (SPT) have been used to reduce such overheads, but primarily for same-ISA virtualization. In this paper we propose a novel approach called embedded shadow page tables (ESPT). ESPT embeds a shadow page table into the address space of a cross-ISA dynamic binary translation (DBT) and uses hardware memory management unit in the CPU to translate memory addresses, instead of software translation in a current DBT emulator like QEMU. We also use the larger address space on modern 64-bit CPUs to accommodate our DBT emulator so that it will not interfere with the guest operating system. We incorporate our new scheme into QEMU, a popular, retargetable cross-ISA system emulator. SPEC CINT2006 benchmark results indicate that our technique achieves an average speedup of 1.51 times in system mode when emulating ARM on x86, and a 1.59 times speedup for emulating IA32 on x86_64.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Zhang:2014:PSS, author = "Mingwei Zhang and Rui Qiao and Niranjan Hasabnis and R. Sekar", title = "A platform for secure static binary instrumentation", journal = j-SIGPLAN, volume = "49", number = "7", pages = "129--140", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576208", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Program instrumentation techniques form the basis of many recent software security defenses, including defenses against common exploits and security policy enforcement. As compared to source-code instrumentation, binary instrumentation is easier to use and more broadly applicable due to the ready availability of binary code. Two key features needed for security instrumentations are (a) it should be applied to all application code, including code contained in various system and application libraries, and (b) it should be non-bypassable. So far, dynamic binary instrumentation (DBI) techniques have provided these features, whereas static binary instrumentation (SBI) techniques have lacked them. These features, combined with ease of use, have made DBI the de facto choice for security instrumentations. However, DBI techniques can incur high overheads in several common usage scenarios, such as application startups, system-calls, and many real-world applications. We therefore develop a new platform for secure static binary instrumentation (PSI) that overcomes these drawbacks of DBI techniques, while retaining the security, robustness and ease-of-use features. We illustrate the versatility of PSI by developing several instrumentation applications: basic block counting, shadow stack defense against control-flow hijack and return-oriented programming attacks, and system call and library policy enforcement.
While being competitive with the best DBI tools on CPU-intensive SPEC 2006 benchmark, PSI provides an order of magnitude reduction in overheads on a collection of real-world applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Lyu:2014:DER, author = "Yi-Hong Lyu and Ding-Yong Hong and Tai-Yi Wu and Jan-Jan Wu and Wei-Chung Hsu and Pangfeng Liu and Pen-Chung Yew", title = "{DBILL}: an efficient and retargetable dynamic binary instrumentation framework using {LLVM} backend", journal = j-SIGPLAN, volume = "49", number = "7", pages = "141--152", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576213", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic Binary Instrumentation (DBI) is a core technology for building debugging and profiling tools for application executables. Most state-of-the-art DBI systems have focused on the same instruction set architecture (ISA) where the guest binary and the host binary have the same ISA. It is uncommon to have a cross-ISA DBI system, such as a system that instruments ARM executables to run on x86 machines. We believe cross-ISA DBI systems are increasingly more important, since ARM executables could be more productively analyzed on x86 based machines such as commonly available PCs and servers. In this paper, we present DBILL, a cross-ISA and retargetable dynamic binary instrumentation framework that builds on both QEMU and LLVM. The DBILL framework enables LLVM-based static instrumentation tools to become DBI ready, and deployable to different target architectures. Using address sanitizer and memory sanitizer as implementation examples, we show DBILL is an efficient, versatile and easy to use cross-ISA retargetable DBI framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Zheng:2014:CCM, author = "Jie Zheng and Tze Sing Eugene Ng and Kunwadee Sripanidkulchai and Zhaolei Liu", title = "{COMMA}: coordinating the migration of multi-tier applications", journal = j-SIGPLAN, volume = "49", number = "7", pages = "153--164", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576200", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Multi-tier applications are widely deployed in today's virtualized cloud computing environments. At the same time, management operations in these virtualized environments, such as load balancing, hardware maintenance, workload consolidation, etc., often make use of live virtual machine (VM) migration to control the placement of VMs. Although existing solutions are able to migrate a single VM efficiently, little attention has been devoted to migrating related VMs in multi-tier applications. Ignoring the relatedness of VMs during migration can lead to serious application performance degradation. 
This paper formulates the multi-tier application migration problem, and presents a new communication-impact-driven coordinated approach, as well as a system called COMMA that realizes this approach. Through extensive testbed experiments, numerical analyses, and a demonstration of COMMA on Amazon EC2, we show that this approach is highly effective in minimizing migration's impact on multi-tier applications' performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Kumar:2014:FBE, author = "Vivek Kumar and Stephen M. Blackburn and David Grove", title = "Friendly barriers: efficient work-stealing with return barriers", journal = j-SIGPLAN, volume = "49", number = "7", pages = "165--176", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576207", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper addresses the problem of efficiently supporting parallelism within a managed runtime. A popular approach for exploiting software parallelism on parallel hardware is task parallelism, where the programmer explicitly identifies potential parallelism and the runtime then schedules the work. Work-stealing is a promising scheduling strategy that a runtime may use to keep otherwise idle hardware busy while relieving overloaded hardware of its burden. However, work-stealing comes with substantial overheads. Recent work identified sequential overheads of work-stealing, those that occur even when no stealing takes place, as a significant source of overhead. That work was able to reduce sequential overheads to just 15\%. In this work, we turn to dynamic overheads, those that occur each time a steal takes place. We show that the dynamic overhead is dominated by introspection of the victim's stack when a steal takes place. We exploit the idea of a low overhead return barrier to reduce the dynamic overhead by approximately half, resulting in total performance improvements of as much as 20\%. Because, unlike prior work, we attack the overheads directly due to stealing and therefore attack the overheads that grow as parallelism grows, we improve the scalability of work-stealing applications. This result is complementary to recent work addressing the sequential overheads of work-stealing. 
This work therefore substantially relieves work-stealing of the increasing pressure due to increasing intra-node hardware parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Horie:2014:SDJ, author = "Michihiro Horie and Kazunori Ogata and Kiyokuni Kawachiya and Tamiya Onodera", title = "String deduplication for {Java}-based middleware in virtualized environments", journal = j-SIGPLAN, volume = "49", number = "7", pages = "177--188", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576210", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "To increase the memory efficiency in physical servers is a significant concern for increasing the number of virtual machines (VM) in them. When similar web application service runs in each guest VM, many string data with the same values are created in every guest VMs. These duplications of string data are redundant from the viewpoint of memory efficiency in the host OS. This paper proposes two approaches to reduce the duplication in Java string in a single Java VM (JVM) and across JVMs. The first approach is to share string objects cross JVMs by using a read-only memory-mapped file. The other approach is to selectively unify string objects created at runtime in the web applications. This paper evaluates our approach by using the Apache DayTrader and the DaCapo benchmark suite. Our prototype implementation achieved 7\% to 12\% reduction in the total size of the objects allocated over the lifetime of the programs. In addition, we observed the performance of DayTrader was maintained even under a situation of high density guest VMs in a KVM host machine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Stecklina:2014:SHO, author = "Julian Stecklina", title = "Shrinking the hypervisor one subsystem at a time: a userspace packet switch for virtual machines", journal = j-SIGPLAN, volume = "49", number = "7", pages = "189--200", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576202", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Efficient and secure networking between virtual machines is crucial in a time where a large share of the services on the Internet and in private datacenters run in virtual machines. To achieve this efficiency, virtualization solutions, such as Qemu/KVM, move toward a monolithic system architecture in which all performance critical functionality is implemented directly in the hypervisor in privileged mode. This is an attack surface in the hypervisor that can be used from compromised VMs to take over the virtual machine host and all VMs running on it. 
We show that it is possible to implement an efficient network switch for virtual machines as an unprivileged userspace component running in the host system including the driver for the upstream network adapter. Our network switch relies on functionality already present in the KVM hypervisor and requires no changes to Linux, the host operating system, and the guest. Our userspace implementation compares favorably to the existing in-kernel implementation with respect to throughput and latency. We reduced per-packet overhead by using a run-to-completion model and are able to outperform the unmodified system for VM-to-VM traffic by a large margin when packet rates are high.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Li:2014:VSK, author = "Ye Li and Richard West and Eric Missimer", title = "A virtualized separation kernel for mixed criticality systems", journal = j-SIGPLAN, volume = "49", number = "7", pages = "201--212", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576206", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Multi- and many-core processors are becoming increasingly popular in embedded systems. Many of these processors now feature hardware virtualization capabilities, such as the ARM Cortex A15, and x86 processors with Intel VT-x or AMD-V support. Hardware virtualization offers opportunities to partition physical resources, including processor cores, memory and I/O devices amongst guest virtual machines. Mixed criticality systems and services can then co-exist on the same platform in separate virtual machines. However, traditional virtual machine systems are too expensive because of the costs of trapping into hypervisors to multiplex and manage machine physical resources on behalf of separate guests. For example, hypervisors are needed to schedule separate VMs on physical processor cores. In this paper, we discuss the design of the Quest-V separation kernel, which partitions services of different criticalities in separate virtual machines, or sandboxes. Each sandbox encapsulates a subset of machine physical resources that it manages without requiring intervention of a hypervisor.
Moreover, a hypervisor is not needed for normal operation, except to bootstrap the system and establish communication channels between sandboxes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Johnson:2014:CML, author = "David Johnson and Mike Hibler and Eric Eide", title = "Composable multi-level debugging with {Stackdb}", journal = j-SIGPLAN, volume = "49", number = "7", pages = "213--226", month = jul, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2674025.2576212", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:29:50 MDT 2014", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Virtual machine introspection (VMI) allows users to debug software that executes within a virtual machine. To support rich, whole-system analyses, a VMI tool must inspect and control systems at multiple levels of the software stack. Traditional debuggers enable inspection and control, but they limit users to treating a whole system as just one kind of target: e.g., just a kernel, or just a process, but not both. We created Stackdb, a debugging library with VMI support that allows one to monitor and control a whole system through multiple, coordinated targets. A target corresponds to a particular level of the system's software stack; multiple targets allow a user to observe a VM guest at several levels of abstraction simultaneously. For example, with Stackdb, one can observe a PHP script running in a Linux process in a Xen VM via three coordinated targets at the language, process, and kernel levels. Within Stackdb, higher-level targets are components that utilize lower-level targets; a key contribution of Stackdb is its API that supports multi-level and flexible ``stacks'' of targets. This paper describes the challenges we faced in creating Stackdb, presents the solutions we devised, and evaluates Stackdb through its application to a security-focused, whole-system case study.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '14 conference proceedings.", } @Article{Hill:2014:CCA, author = "Mark D. Hill", title = "21st century computer architecture", journal = j-SIGPLAN, volume = "49", number = "8", pages = "1--2", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2558890", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This talk has two parts. The first part will discuss possible directions for computer architecture research, including architecture as infrastructure, energy first, impact of new technologies, and cross-layer opportunities. This part is based on a 2012 Computing Community Consortium (CCC) whitepaper effort led by Hill, as well as other recent National Academy and ISAT studies. See: \url{http://cra.org/ccc/docs/init/21stcenturyarchitecturewhitepaper.pdf}. The second part of the talk will discuss one or more examples of cross-layer research advocated in the first part. 
For example, our analysis shows that many ``big-memory'' server workloads, such as databases, in-memory caches, and graph analytics, pay a high cost for page-based virtual memory: up to 50\% of execution time wasted. Via small changes to the operating system (Linux) and hardware (x86-64 MMU), this work reduces execution time these workloads waste to less than 0.5\%. The key idea is to map part of a process's linear virtual address space with a new incarnation of segmentation, while providing compatibility by mapping the rest of the virtual address space with paging.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Liu:2014:PPF, author = "Tongping Liu and Chen Tian and Ziang Hu and Emery D. Berger", title = "{PREDATOR}: predictive false sharing detection", journal = j-SIGPLAN, volume = "49", number = "8", pages = "3--14", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555244", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "False sharing is a notorious problem for multithreaded applications that can drastically degrade both performance and scalability. Existing approaches can precisely identify the sources of false sharing, but only report false sharing actually observed during execution; they do not generalize across executions. Because false sharing is extremely sensitive to object layout, these detectors can easily miss false sharing problems that can arise due to slight differences in memory allocation order or object placement decisions by the compiler. In addition, they cannot predict the impact of false sharing on hardware with different cache line sizes. This paper presents PREDATOR, a predictive software-based false sharing detector. PREDATOR generalizes from a single execution to precisely predict false sharing that is latent in the current execution. PREDATOR tracks accesses within a range that could lead to false sharing given different object placement. It also tracks accesses within virtual cache lines, contiguous memory ranges that span actual hardware cache lines, to predict sharing on hardware platforms with larger cache line sizes. For each, it reports the exact program location of predicted false sharing problems, ranked by their projected impact on performance. We evaluate PREDATOR across a range of benchmarks and actual applications. PREDATOR identifies problems undetectable with previous tools, including two previously-unknown false sharing problems, with no false positives. PREDATOR is able to immediately locate false sharing problems in MySQL and the Boost library that had eluded detection for years.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Thomson:2014:CTU, author = "Paul Thomson and Alastair F. 
Donaldson and Adam Betts", title = "Concurrency testing using schedule bounding: an empirical study", journal = j-SIGPLAN, volume = "49", number = "8", pages = "15--28", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555260", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the first independent empirical study on schedule bounding techniques for systematic concurrency testing (SCT). We have gathered 52 buggy concurrent software benchmarks, drawn from public code bases, which we call SCTBench. We applied a modified version of an existing concurrency testing tool to SCTBench to attempt to answer several research questions, including: How effective are the two main schedule bounding techniques, preemption bounding and delay bounding, at bug finding? What challenges are associated with applying SCT to existing code? How effective is schedule bounding compared to a naive random scheduler at finding bugs? Our findings confirm that delay bounding is superior to preemption bounding and that schedule bounding is more effective at finding bugs than unbounded depth-first search. The majority of bugs in SCTBench can be exposed using a small bound (1-3), supporting previous claims, but there is at least one benchmark that requires 5 preemptions. Surprisingly, we found that a naive random scheduler is at least as effective as schedule bounding for finding bugs. We have made SCTBench and our tools publicly available for reproducibility and use in future work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Samak:2014:TDD, author = "Malavika Samak and Murali Krishna Ramanathan", title = "Trace driven dynamic deadlock detection and reproduction", journal = j-SIGPLAN, volume = "49", number = "8", pages = "29--42", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555262", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic analysis techniques have been proposed to detect potential deadlocks. Analyzing and comprehending each potential deadlock to determine whether the deadlock is feasible in a real execution requires significant programmer effort. Moreover, empirical evidence shows that existing analyses are quite imprecise. This imprecision of the analyses further voids the manual effort invested in reasoning about non-existent defects. In this paper, we address the problems of imprecision of existing analyses and the subsequent manual effort necessary to reason about deadlocks. We propose a novel approach for deadlock detection by designing a dynamic analysis that intelligently leverages execution traces. To reduce the manual effort, we replay the program by making the execution follow a schedule derived based on the observed trace. For a real deadlock, its feasibility is automatically verified if the replay causes the execution to deadlock. We have implemented our approach as part of WOLF and have analyzed many large (up to 160 KLoC) Java programs. 
Our experimental results show that we are able to identify 74\% of the reported defects as true (or false) positives automatically, leaving very few defects for manual analysis. The overhead of our approach is negligible, making it a compelling tool for practical adoption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Chiang:2014:ESI, author = "Wei-Fan Chiang and Ganesh Gopalakrishnan and Zvonimir Rakamaric and Alexey Solovyev", title = "Efficient search for inputs causing high floating-point errors", journal = j-SIGPLAN, volume = "49", number = "8", pages = "43--52", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555265", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Tools for floating-point error estimation are fundamental to program understanding and optimization. In this paper, we focus on tools for determining the input settings to a floating point routine that maximizes its result error. Such tools can help support activities such as precision allocation, performance optimization, and auto-tuning. We benchmark current abstraction-based precision analysis methods, and show that they often do not work at scale, or generate highly pessimistic error estimates, often caused by non-linear operators or complex input constraints that define the set of legal inputs. We show that while concrete-testing-based error estimation methods based on maintaining shadow values at higher precision can search out higher error-inducing inputs, suitable heuristic search guidance is key to finding higher errors. We develop a heuristic search algorithm called Binary Guided Random Testing (BGRT). In 45 of the 48 total benchmarks, including many real-world routines, BGRT returns higher guaranteed errors. We also evaluate BGRT against two other heuristic search methods called ILS and PSO, obtaining better results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Tardieu:2014:XAP, author = "Olivier Tardieu and Benjamin Herta and David Cunningham and David Grove and Prabhanjan Kambadur and Vijay Saraswat and Avraham Shinnar and Mikio Takeuchi and Mandana Vaziri", title = "{X10} and {APGAS} at Petascale", journal = j-SIGPLAN, volume = "49", number = "8", pages = "53--66", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555245", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "X10 is a high-performance, high-productivity programming language aimed at large-scale distributed and shared-memory parallel applications. It is based on the Asynchronous Partitioned Global Address Space (APGAS) programming model, supporting the same fine-grained concurrency mechanisms within and across shared-memory nodes. 
We demonstrate that X10 delivers solid performance at petascale by running (weak scaling) eight application kernels on an IBM Power 775 supercomputer utilizing up to 55,680 Power7 cores (for 1.7 Pflop/s of theoretical peak performance). We detail our advances in distributed termination detection, distributed load balancing, and use of high-performance interconnects that enable X10 to scale out to tens of thousands of cores. For the four HPC Class 2 Challenge benchmarks, X10 achieves 41\% to 87\% of the system's potential at scale (as measured by IBM's HPCC Class 1 optimized runs). We also implement K-Means, Smith-Waterman, Betweenness Centrality, and Unbalanced Tree Search (UTS) for geometric trees. Our UTS implementation is the first to scale to petaflop systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Cunningham:2014:RXE, author = "David Cunningham and David Grove and Benjamin Herta and Arun Iyengar and Kiyokuni Kawachiya and Hiroki Murata and Vijay Saraswat and Mikio Takeuchi and Olivier Tardieu", title = "Resilient {X10}: efficient failure-aware programming", journal = j-SIGPLAN, volume = "49", number = "8", pages = "67--80", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555248", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scale-out programs run on multiple processes in a cluster. In scale-out systems, processes can fail. Computations using traditional libraries such as MPI fail when any component process fails. The advent of Map Reduce, Resilient Data Sets and MillWheel has shown dramatic improvements in productivity are possible when a high-level programming framework handles scale-out and resilience automatically. We are concerned with the development of general-purpose languages that support resilient programming. In this paper we show how the X10 language and implementation can be extended to support resilience. In Resilient X10, places may fail asynchronously, causing loss of the data and tasks at the failed place. Failure is exposed through exceptions. We identify a {\em Happens Before Invariance Principle} and require the runtime to automatically repair the global control structure of the program to maintain this principle. We show this reduces much of the burden of resilient programming. The programmer is only responsible for continuing execution with fewer computational resources and the loss of part of the heap, and can do so while taking advantage of domain knowledge. We build a complete implementation of the language, capable of executing benchmark applications on hundreds of nodes. We describe the algorithms required to make the language runtime resilient. We then give three applications, each with a different approach to fault tolerance (replay, decimation, and domain-level checkpointing). These can be executed at scale and survive node failure. We show that for these programs the overhead of resilience is a small fraction of overall runtime by comparing to equivalent non-resilient X10 programs. 
On one program we show end-to-end performance of Resilient X10 is ~100x faster than Hadoop.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Yang:2014:PMI, author = "Chaoran Yang and Wesley Bland and John Mellor-Crummey and Pavan Balaji", title = "Portable, {MPI}-interoperable {Coarray Fortran}", journal = j-SIGPLAN, volume = "49", number = "8", pages = "81--92", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555270", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The past decade has seen the advent of a number of parallel programming models such as Coarray Fortran (CAF), Unified Parallel C, X10, and Chapel. Despite the productivity gains promised by these models, most parallel scientific applications still rely on MPI as their data movement model. One reason for this trend is that it is hard for users to incrementally adopt these new programming models in existing MPI applications. Because each model uses its own runtime system, they duplicate resources and are potentially error-prone. Such independent runtime systems were deemed necessary because MPI was considered insufficient in the past to play this role for these languages. The recently released MPI-3, however, adds several new capabilities that now provide all of the functionality needed to act as a runtime, including a much more comprehensive one-sided communication framework. In this paper, we investigate how MPI-3 can form a runtime system for one example programming model, CAF, with a broader goal of enabling a single application to use both MPI and CAF with the highest level of interoperability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Yang:2014:CNR, author = "Yi Yang and Huiyang Zhou", title = "{CUDA-NP}: realizing nested thread-level parallelism in {GPGPU} applications", journal = j-SIGPLAN, volume = "49", number = "8", pages = "93--106", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555254", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parallel programs consist of series of code sections with different thread-level parallelism (TLP). As a result, it is rather common that a thread in a parallel program, such as a GPU kernel in CUDA programs, still contains both sequential code and parallel loops. In order to leverage such parallel loops, the latest Nvidia Kepler architecture introduces dynamic parallelism, which allows a GPU thread to start another GPU kernel, thereby reducing the overhead of launching kernels from a CPU. However, with dynamic parallelism, a parent thread can only communicate with its child threads through global memory and the overhead of launching GPU kernels is non-trivial even within GPUs. In this paper, we first study a set of GPGPU benchmarks that contain parallel loops, and highlight that these benchmarks do not have a very high loop count or high degrees of TLP. 
Consequently, the benefits of leveraging such parallel loops using dynamic parallelism are too limited to offset its overhead. We then present our proposed solution to exploit nested parallelism in CUDA, referred to as CUDA-NP. With CUDA-NP, we initially enable a high number of threads when a GPU program starts, and use control flow to activate different numbers of threads for different code sections. We implemented our proposed CUDA-NP framework using a directive-based compiler approach. For a GPU kernel, an application developer only needs to add OpenMP-like pragmas for parallelizable code sections. Then, our CUDA-NP compiler automatically generates the optimized GPU kernels. It supports both the reduction and the scan primitives, explores different ways to distribute parallel loop iterations into threads, and efficiently manages on-chip resources. Our experiments show that for a set of GPGPU benchmarks, which have already been optimized and contain nested parallelism, our proposed CUDA-NP framework further improves the performance by up to 6.69 times and 2.18 times on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Yan:2014:YYA, author = "Shengen Yan and Chao Li and Yunquan Zhang and Huiyang Zhou", title = "{yaSpMV}: yet another {SpMV} framework on {GPUs}", journal = j-SIGPLAN, volume = "49", number = "8", pages = "107--118", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555255", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "SpMV is a key linear algebra algorithm and has been widely used in many important application domains. As a result, numerous attempts have been made to optimize SpMV on GPUs to leverage their massive computational throughput. Although the previous work has shown impressive progress, load imbalance and high memory bandwidth remain the critical performance bottlenecks for SpMV. In this paper, we present our novel solutions to these problems. First, we devise a new SpMV format, called blocked compressed common coordinate (BCCOO), which uses bit flags to store the row indices in a blocked common coordinate (COO) format so as to alleviate the bandwidth problem. We further improve this format by partitioning the matrix into vertical slices to enhance the cache hit rates when accessing the vector to be multiplied. Second, we revisit the segmented scan approach for SpMV to address the load imbalance problem. We propose a highly efficient matrix-based segmented sum/scan for SpMV and further improve it by eliminating global synchronization. Then, we introduce an auto-tuning framework to choose optimization parameters based on the characteristics of input sparse matrices and target hardware platforms. 
Our experimental results on GTX680 GPUs and GTX480 GPUs show that our proposed framework achieves significant performance improvement over the vendor tuned CUSPARSE V5.0 (up to 229\% and 65\% on average on GTX680 GPUs, up to 150\% and 42\% on average on GTX480 GPUs) and some most recently proposed schemes (e.g., up to 195\% and 70\% on average over clSpMV on GTX680 GPUs, up to 162\% and 40\% on average over clSpMV on GTX480 GPUs).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Bauer:2014:SLW, author = "Michael Bauer and Sean Treichler and Alex Aiken", title = "{Singe}: leveraging warp specialization for high performance on {GPUs}", journal = j-SIGPLAN, volume = "49", number = "8", pages = "119--130", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555258", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Singe, a Domain Specific Language (DSL) compiler for combustion chemistry that leverages warp specialization to produce high performance code for GPUs. Instead of relying on traditional GPU programming models that emphasize data-parallel computations, warp specialization allows compilers like Singe to partition computations into sub-computations which are then assigned to different warps within a thread block. Fine-grain synchronization between warps is performed efficiently in hardware using producer-consumer named barriers. Partitioning computations using warp specialization allows Singe to deal efficiently with the irregularity in both data access patterns and computation. Furthermore, warp-specialized partitioning of computations allows Singe to fit extremely large working sets into on-chip memories. Finally, we describe the architecture and general compilation techniques necessary for constructing a warp-specializing compiler. We show that the warp-specialized code emitted by Singe is up to 3.75X faster than previously optimized data-parallel GPU kernels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Odaira:2014:EGI, author = "Rei Odaira and Jose G. Castanos and Hisanobu Tomari", title = "Eliminating global interpreter locks in {Ruby} through hardware transactional memory", journal = j-SIGPLAN, volume = "49", number = "8", pages = "131--142", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555247", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many scripting languages use a Global Interpreter Lock (GIL) to simplify the internal designs of their interpreters, but this kind of lock severely lowers the multi-thread performance on multi-core machines. This paper presents our first results eliminating the GIL in Ruby using Hardware Transactional Memory (HTM) in the IBM zEnterprise EC12 and Intel 4th Generation Core processors. Though prior prototypes replaced a GIL with HTM, we tested realistic programs, the Ruby NAS Parallel Benchmarks (NPB), the WEBrick HTTP server, and Ruby on Rails. 
We devised a new technique to dynamically adjust the transaction lengths on a per-bytecode basis, so that we can optimize the likelihood of transaction aborts against the relative overhead of the instructions to begin and end the transactions. Our results show that HTM achieved 1.9- to 4.4-fold speedups in the NPB programs over the GIL with 12 threads, and 1.6- and 1.2-fold speedups in WEBrick and Ruby on Rails, respectively. The dynamic transaction-length adjustment chose the best transaction lengths for any number of threads and applications with sufficiently long running times.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Petrovic:2014:LHM, author = "Darko Petrovi{\'c} and Thomas Ropars and Andr{\'e} Schiper", title = "Leveraging hardware message passing for efficient thread synchronization", journal = j-SIGPLAN, volume = "49", number = "8", pages = "143--154", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555251", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As the level of parallelism in manycore processors keeps increasing, providing efficient mechanisms for thread synchronization in concurrent programs is becoming a major concern. On cache-coherent shared-memory processors, synchronization efficiency is ultimately limited by the performance of the underlying cache coherence protocol. This paper studies how hardware support for message passing can improve synchronization performance. Considering the ubiquitous problem of mutual exclusion, we adapt two state-of-the-art solutions used on shared-memory processors, namely the server approach and the combining approach, to leverage the potential of hardware message passing. We propose HybComb, a novel combining algorithm that uses both message passing and shared memory features of emerging hybrid processors. We also introduce MP-Server, a straightforward adaptation of the server approach to hardware message passing. Evaluation on Tilera's TILE-Gx processor shows that MP-Server can execute contended critical sections with unprecedented throughput, as stalls related to cache coherence are removed from the critical path. HybComb can achieve comparable performance, while avoiding the need to dedicate server cores. 
Consequently, our queue and stack implementations, based on MP-Server and HybComb, largely outperform their most efficient pure-shared-memory counterparts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Herlihy:2014:WSF, author = "Maurice Herlihy and Zhiyu Liu", title = "Well-structured futures and cache locality", journal = j-SIGPLAN, volume = "49", number = "8", pages = "155--166", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555257", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In fork-join parallelism, a sequential program is split into a directed acyclic graph of tasks linked by directed dependency edges, and the tasks are executed, possibly in parallel, in an order consistent with their dependencies. A popular and effective way to extend fork-join parallelism is to allow threads to create {\em futures}. A thread creates a future to hold the results of a computation, which may or may not be executed in parallel. That result is returned when some thread touches that future, blocking if necessary until the result is ready. Recent research has shown that while futures can, of course, enhance parallelism in a structured way, they can have a deleterious effect on cache locality. In the worst case, futures can incur $\Omega(P T_\infty + t T_\infty)$ deviations, which implies $\Omega(C P T_\infty + C t T_\infty)$ additional cache misses, where $C$ is the number of cache lines, $P$ is the number of processors, $t$ is the number of touches, and $T_\infty$ is the computation span. Since cache locality has a large impact on software performance on modern multicores, this result is troubling. In this paper, however, we show that if futures are used in a simple, disciplined way, then the situation is much better: if each future is touched only once, either by the thread that created it, or by a later descendant of the thread that created it, then parallel executions with work stealing can incur at most $O(C P T_\infty^2)$ additional cache misses, a substantial improvement. This structured use of futures is characteristic of many (but not all) parallel applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Diegues:2014:TWL, author = "Nuno Diegues and Paolo Romano", title = "{Time-Warp}: lightweight abort minimization in transactional memory", journal = j-SIGPLAN, volume = "49", number = "8", pages = "167--178", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555259", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The notion of permissiveness in Transactional Memory (TM) translates to only aborting a transaction when it cannot be accepted in any history that guarantees correctness criterion. This property is neglected by most TMs, which, in order to maximize implementation's efficiency, resort to aborting transactions under overly conservative conditions. 
In this paper we seek to identify a sweet spot between permissiveness and efficiency by introducing the Time-Warp Multi-version algorithm (TWM). TWM is based on the key idea of allowing an update transaction that has performed stale reads (i.e., missed the writes of concurrently committed transactions) to be serialized by committing it in the past, which we call a time-warp commit. At its core, TWM uses a novel, lightweight validation mechanism with little computational overhead. TWM also guarantees that read-only transactions can never be aborted. Further, TWM guarantees Virtual World Consistency, a safety property that is deemed as particularly relevant in the context of TM. We demonstrate the practicality of this approach through an extensive experimental study, where we compare TWM with four other TMs, and show an average performance improvement of 65\% in high concurrency scenarios.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Olukotun:2014:BPP, author = "Kunle Olukotun", title = "Beyond parallel programming with domain specific languages", journal = j-SIGPLAN, volume = "49", number = "8", pages = "179--180", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2557966", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today, almost all computer architectures are parallel and heterogeneous; a combination of multiple CPUs, GPUs and specialized processors. This creates a challenging problem for application developers who want to develop high performance programs without the effort required to use low-level, architecture specific parallel programming models (e.g., OpenMP for CMPs, CUDA for GPUs, MPI for clusters). Domain-specific languages (DSLs) are a promising solution to this problem because they can provide an avenue for high-level application-specific abstractions with implicit parallelism to be mapped directly to low level architecture-specific programming models; providing both high programmer productivity and high execution performance. In this talk I will describe an approach to building high performance DSLs, which is based on DSL embedding in a general purpose programming language, metaprogramming and a DSL infrastructure called Delite. I will describe how we transform DSL programs into efficient first-order low-level code using domain specific optimization, parallelism and locality optimization with parallel patterns, and architecture-specific code generation. All optimizations and transformations are implemented in Delite: an extensible DSL compiler infrastructure that significantly reduces the effort required to develop new DSLs. Delite DSLs for machine learning, data querying, graph analysis, and scientific computing all achieve performance competitive with manually parallelized C++ code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Song:2014:DAT, author = "Sukhyun Song and Jeffrey K. 
Hollingsworth", title = "Designing and auto-tuning parallel {$3$-D FFT} for computation-communication overlap", journal = j-SIGPLAN, volume = "49", number = "8", pages = "181--192", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555249", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a method to design and auto-tune a new parallel 3-D FFT code using the non-blocking MPI all-to-all operation. We achieve high performance by optimizing computation-communication overlap. Our code performs fully asynchronous communication without any support from special hardware. We also improve cache performance through loop tiling. To cope with the complex trade-off regarding our optimization techniques, we parameterize our code and auto-tune the parameters efficiently in a large parameter space. Experimental results from two systems confirm that our code achieves a speedup of up to 1.76x over the FFTW library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Catanzaro:2014:DPM, author = "Bryan Catanzaro and Alexander Keller and Michael Garland", title = "A decomposition for in-place matrix transposition", journal = j-SIGPLAN, volume = "49", number = "8", pages = "193--206", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555253", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a decomposition for in-place matrix transposition, with applications to Array of Structures memory accesses on SIMD processors. Traditional approaches to in-place matrix transposition involve cycle following, which is difficult to parallelize, and on matrices of dimension $m$ by $n$ require $O(mn \log mn)$ work when limited to less than $O(mn)$ auxiliary space. Our decomposition allows the rows and columns to be operated on independently during in-place transposition, reducing work complexity to $O(mn)$, given $O(\max(m, n))$ auxiliary space. This decomposition leads to an efficient and naturally parallel algorithm: we have measured median throughput of 19.5 GB/s on an NVIDIA Tesla K20c processor. An implementation specialized for the skinny matrices that arise when converting Arrays of Structures to Structures of Arrays yields median throughput of 34.3 GB/s, and a maximum throughput of 51 GB/s. Because of the simple structure of this algorithm, it is particularly suited for implementation using SIMD instructions to transpose the small arrays that arise when SIMD processors load from or store to Arrays of Structures. Using this algorithm to cooperatively perform accesses to Arrays of Structures, we measure 180 GB/s throughput on the K20c, which is up to 45 times faster than compiler-generated Array of Structures accesses. 
In this paper, we explain the algorithm, prove its correctness and complexity, and explain how it can be instantiated efficiently for solving various transpose problems on both CPUs and GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Sung:2014:PTR, author = "I-Jui Sung and Juan G{\'o}mez-Luna and Jos{\'e} Mar{\'\i}a Gonz{\'a}lez-Linares and Nicol{\'a}s Guil and Wen-Mei W. Hwu", title = "In-place transposition of rectangular matrices on accelerators", journal = j-SIGPLAN, volume = "49", number = "8", pages = "207--218", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555266", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Matrix transposition is an important algorithmic building block for many numeric algorithms such as FFT. It has also been used to convert the storage layout of arrays. With more and more algebra libraries offloaded to GPUs, a high performance in-place transposition becomes necessary. Intuitively, in-place transposition should be a good fit for GPU architectures due to limited available on-board memory capacity and high throughput. However, direct application of CPU in-place transposition algorithms lacks the amount of parallelism and locality required by GPUs to achieve good performance. In this paper we present the first known in-place matrix transposition approach for the GPUs. Our implementation is based on a novel 3-stage transposition algorithm where each stage is performed using an elementary tiled-wise transposition. Additionally, when transposition is done as part of the memory transfer between GPU and host, our staged approach allows hiding transposition overhead by overlap with PCIe transfer. We show that the 3-stage algorithm allows larger tiles and achieves 3X speedup over a traditional 4-stage algorithm, with both algorithms based on our high-performance elementary transpositions on the GPU. We also show our proposed low-level optimizations improve the sustained throughput to more than 20 GB/s. Finally, we propose an asynchronous execution scheme that allows CPU threads to delegate in-place matrix transposition to GPU, achieving a throughput of more than 3.4 GB/s (including data transfers costs), and improving current multithreaded implementations of in-place transposition on CPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Maleki:2014:PDP, author = "Saeed Maleki and Madanlal Musuvathi and Todd Mytkowicz", title = "Parallelizing dynamic programming through rank convergence", journal = j-SIGPLAN, volume = "49", number = "8", pages = "219--232", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555264", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper proposes an efficient parallel algorithm for an important class of dynamic programming problems that includes Viterbi, Needleman-Wunsch, Smith-Waterman, and Longest Common Subsequence. 
In dynamic programming, the subproblems that do not depend on each other, and thus can be computed in parallel, form stages or wavefronts. The algorithm presented in this paper provides additional parallelism allowing multiple stages to be computed in parallel despite dependences among them. The correctness and the performance of the algorithm relies on rank convergence properties of matrix multiplication in the tropical semiring, formed with plus as the multiplicative operation and max as the additive operation. This paper demonstrates the efficiency of the parallel algorithm by showing significant speed ups on a variety of important dynamic programming problems. In particular, the parallel Viterbi decoder is up-to 24x faster (with 64 processors) than a highly optimized commercial baseline.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Mehta:2014:RLF, author = "Sanyam Mehta and Pei-Hung Lin and Pen-Chung Yew", title = "Revisiting loop fusion in the polyhedral framework", journal = j-SIGPLAN, volume = "49", number = "8", pages = "233--246", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555250", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Loop fusion is an important compiler optimization for improving memory hierarchy performance through enabling data reuse. Traditional compilers have approached loop fusion in a manner decoupled from other high-level loop optimizations, missing several interesting solutions. Recently, the polyhedral compiler framework with its ability to compose complex transformations, has proved to be promising in performing loop optimizations for small programs. However, our experiments with large programs using state-of-the-art polyhedral compiler frameworks reveal suboptimal fusion partitions in the transformed code. We trace the reason for this to be lack of an effective cost model to choose a good fusion partitioning among the possible choices, which increase exponentially with the number of program statements. In this paper, we propose a fusion algorithm to choose good fusion partitions with two objective functions --- achieving good data reuse and preserving parallelism inherent in the source code. These objectives, although targeted by previous work in traditional compilers, pose new challenges within the polyhedral compiler framework and have thus not been addressed. In our algorithm, we propose several heuristics that work effectively within the polyhedral compiler framework and allow us to achieve the proposed objectives. 
Experimental results show that our fusion algorithm achieves performance comparable to the existing polyhedral compilers for small kernel programs, and significantly outperforms them for large benchmark programs such as those in the SPEC benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Rodrigues:2014:TPS, author = "Christopher Rodrigues and Thomas Jablin and Abdul Dakkak and Wen-Mei Hwu", title = "{Triolet}: a programming system that unifies algorithmic skeleton interfaces for high-performance cluster computing", journal = j-SIGPLAN, volume = "49", number = "8", pages = "247--258", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555268", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Functional algorithmic skeletons promise a high-level programming interface for distributed-memory clusters that free developers from concerns of task decomposition, scheduling, and communication. Unfortunately, prior distributed functional skeleton frameworks do not deliver performance comparable to that achievable in a low-level distributed programming model such as C with MPI and OpenMP, even when used in concert with high-performance array libraries. There are several causes: they do not take advantage of shared memory on each cluster node; they impose a fixed partitioning strategy on input data; and they have limited ability to fuse loops involving skeletons that produce a variable number of outputs per input. We address these shortcomings in the Triolet programming language through a modular library design that separates concerns of parallelism, loop nesting, and data partitioning. We show how Triolet substantially improves the parallel performance of algorithms involving array traversals and nested, variable-size loops over what is achievable in Eden, a distributed variant of Haskell. We further demonstrate how Triolet can substantially simplify parallel programming relative to C with MPI and OpenMP while achieving 23--100\% of its performance on a 128-core cluster.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Liu:2014:TAP, author = "Xu Liu and John Mellor-Crummey", title = "A tool to analyze the performance of multithreaded programs on {NUMA} architectures", journal = j-SIGPLAN, volume = "49", number = "8", pages = "259--272", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555271", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Almost all of today's microprocessors contain memory controllers and directly attach to memory. Modern multiprocessor systems support non-uniform memory access (NUMA): it is faster for a microprocessor to access memory that is directly attached than it is to access memory attached to another processor. Without careful distribution of computation and data, a multithreaded program running on such a system may have high average memory access latency. 
To use multiprocessor systems efficiently, programmers need performance tools to guide the design of NUMA-aware codes. To address this need, we enhanced the HPCToolkit performance tools to support measurement and analysis of performance problems on multiprocessor systems with multiple NUMA domains. With these extensions, HPCToolkit helps pinpoint, quantify, and analyze NUMA bottlenecks in executions of multithreaded programs. It computes derived metrics to assess the severity of bottlenecks, analyzes memory accesses, and provides a wealth of information to guide NUMA optimization, including information about how to distribute data to reduce access latency and minimize contention. This paper describes the design and implementation of our extensions to HPCToolkit. We demonstrate their utility by describing case studies in which we use these capabilities to diagnose NUMA bottlenecks in four multithreaded applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Rao:2014:TFE, author = "Jia Rao and Xiaobo Zhou", title = "Towards fair and efficient {SMP} virtual machine scheduling", journal = j-SIGPLAN, volume = "49", number = "8", pages = "273--286", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555246", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "As multicore processors become prevalent in modern computer systems, there is a growing need for increasing hardware utilization and exploiting the parallelism of such platforms. With virtualization technology, hardware utilization is improved by encapsulating independent workloads into virtual machines (VMs) and consolidating them onto the same machine. SMP virtual machines have been widely adopted to exploit parallelism. For virtualized systems, such as a public cloud, fairness between tenants and the efficiency of running their applications are keys to success. However, we find that existing virtualization platforms fail to enforce fairness between VMs with different number of virtual CPUs (vCPU) that run on multiple CPUs. We attribute the unfairness to the use of per-CPU schedulers and the load imbalance on these CPUs that incur inaccurate CPU allocations. Unfortunately, existing approaches to reduce unfairness, e.g., dynamic load balancing and CPU capping, introduce significant inefficiencies to parallel workloads. In this paper, we present Flex, a vCPU scheduling scheme that enforces fairness at VM-level and improves the efficiency of hosted parallel applications. Flex centers on two key designs: (1) dynamically adjusting vCPU weights (FlexW) on multiple CPUs to achieve VM-level fairness and (2) flexibly scheduling vCPUs (FlexS) to minimize wasted busy-waiting time. We have implemented Flex in Xen and performed comprehensive evaluations with various parallel workloads. Results show that Flex is able to achieve CPU allocations with on average no more than 5\% error compared to the ideal fair allocation. 
Further, Flex outperforms Xen's credit scheduler and two representative co-scheduling approaches by as much as $ 10 \times $ for parallel applications using busy-waiting or blocking synchronization methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Lu:2014:EDM, author = "Kai Lu and Xu Zhou and Tom Bergan and Xiaoping Wang", title = "Efficient deterministic multithreading without global barriers", journal = j-SIGPLAN, volume = "49", number = "8", pages = "287--300", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555252", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multithreaded programs execute nondeterministically on conventional architectures and operating systems. This complicates many tasks, including debugging and testing. Deterministic multithreading (DMT) makes the output of a multithreaded program depend on its inputs only, which can totally solve the above problem. However, current DMT implementations suffer from a common inefficiency: they use frequent global barriers to enforce a deterministic ordering on memory accesses. In this paper, we eliminate that inefficiency using an execution model we call deterministic lazy release consistency (DLRC). Our execution model uses the Kendo algorithm to enforce a deterministic ordering on synchronization, and it uses a deterministic version of the lazy release consistency memory model to propagate memory updates across threads. Our approach guarantees that programs execute deterministically even when they contain data races. We implemented a DMT system based on these ideas (RFDet) and evaluated it using 16 parallel applications. Our implementation targets C/C++ programs that use POSIX threads. Results show that RFDet gains nearly 2x speedup compared with DThreads, a state-of-the-art DMT system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Eslamimehr:2014:RDS, author = "Mahdi Eslamimehr and Jens Palsberg", title = "Race directed scheduling of concurrent programs", journal = j-SIGPLAN, volume = "49", number = "8", pages = "301--314", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555263", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Detection of data races in Java programs remains a difficult problem. The best static techniques produce many false positives, and also the best dynamic techniques leave room for improvement. We present a new technique called race directed scheduling that for a given race candidate searches for an input and a schedule that lead to the race. The search iterates a combination of concolic execution and schedule improvement, and turns out to find useful inputs and schedules efficiently. We use an existing technique to produce a manageable number of race candidates. Our experiments on 23 Java programs found 72 real races that were missed by the best existing dynamic techniques. 
Among those 72 races, 31 races were found with schedules that have between 1 million and 108 million events, which suggests that they are rare and hard-to-find races.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Rubin:2014:HCW, author = "Norm Rubin", title = "Heterogeneous computing: what does it mean for compiler research?", journal = j-SIGPLAN, volume = "49", number = "8", pages = "315--316", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2558891", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The current trend in computer architecture is to increase the number of cores, to create specialized types of cores within a single machine, and to network such machines together in very fluid web/cloud computing arrangements. Compilers have traditionally focused on optimizations to code that improve performance, but is that the right target to speed up real applications? Consider loading a web page (like starting GMAIL) the page is transferred to the client, any JavaScript is compiled, the JavaScript executes, and the page gets displayed. The classic compiler model (which was first developed in the late 50's) was a great fit for single core machines but has fallen behind architecture, and language. For example how do you compile a single program for a machine that has both a CPU and a graphics coprocessor (a GPU) with a very different programming and memory model? Together with the changes in architecture there have been changes in programming languages. Dynamic languages are used more, static languages are used less. How does this effect compiler research? In this talk, I'll review a number of traditional compiler research challenges that have (or will) become burning issues and will describe some new problems areas that were not considered in the past. For example language specifications are large complex technical documents that are difficult for non-experts to follow. Application programmers are often not willing to read these documents; can a compiler bridge the gap?", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Natarajan:2014:FCL, author = "Aravind Natarajan and Neeraj Mittal", title = "Fast concurrent lock-free binary search trees", journal = j-SIGPLAN, volume = "49", number = "8", pages = "317--328", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555256", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new lock-free algorithm for concurrent manipulation of a binary search tree in an asynchronous shared memory system that supports search, insert and delete operations. In addition to read and write instructions, our algorithm uses (single-word) compare-and-swap (CAS) and bit-test-and-set (SETB) atomic instructions, both of which are commonly supported by many modern processors including Intel~64 and AMD64. 
In contrast to existing lock-free algorithms for a binary search tree, our algorithm is based on marking edges rather than nodes. As a result, when compared to other lock-free algorithms, modify (insert and delete) operations in our algorithm work on a smaller portion of the tree, thereby reducing conflicts, and execute fewer atomic instructions (one for insert and three for delete). Our experiments indicate that our lock-free algorithm significantly outperforms all other algorithms for a concurrent binary search tree in many cases, especially when contention is high, by as much as 100\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Brown:2014:GTN, author = "Trevor Brown and Faith Ellen and Eric Ruppert", title = "A general technique for non-blocking trees", journal = j-SIGPLAN, volume = "49", number = "8", pages = "329--342", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555267", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a general technique for obtaining provably correct, non-blocking implementations of a large class of tree data structures where pointers are directed from parents to children. Updates are permitted to modify any contiguous portion of the tree atomically. Our non-blocking algorithms make use of the LLX, SCX and VLX primitives, which are multi-word generalizations of the standard LL, SC and VL primitives and have been implemented from single-word CAS. To illustrate our technique, we describe how it can be used in a fairly straightforward way to obtain a non-blocking implementation of a chromatic tree, which is a relaxed variant of a red-black tree. The height of the tree at any time is O(c + log n), where n is the number of keys and c is the number of updates in progress. We provide an experimental performance analysis which demonstrates that our Java implementation of a chromatic tree rivals, and often significantly outperforms, other leading concurrent dictionaries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Drachsler:2014:PCB, author = "Dana Drachsler and Martin Vechev and Eran Yahav", title = "Practical concurrent binary search trees via logical ordering", journal = j-SIGPLAN, volume = "49", number = "8", pages = "343--356", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555269", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present practical, concurrent binary search tree (BST) algorithms that explicitly maintain logical ordering information in the data structure, permitting clean separation from its physical tree layout. We capture logical ordering using intervals, with the property that an item belongs to the tree if and only if the item is an endpoint of some interval. We are thus able to construct efficient, synchronization-free and intuitive lookup operations. 
We present (i) a concurrent non-balanced BST with a lock-free lookup, and (ii) a concurrent AVL tree with a lock-free lookup that requires no synchronization with any mutating operations, including balancing operations. Our algorithms apply on-time deletion; that is, every request for removal of a node results in its immediate removal from the tree. This new feature did not exist in previous concurrent internal tree algorithms. We implemented our concurrent BST algorithms and evaluated them against several state-of-the-art concurrent tree algorithms. Our experimental results show that our algorithms with lock-free contains and on-time deletion are practical and often comparable to the state-of-the-art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Timnat:2014:PWF, author = "Shahar Timnat and Erez Petrank", title = "A practical wait-free simulation for lock-free data structures", journal = j-SIGPLAN, volume = "49", number = "8", pages = "357--368", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555261", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Lock-free data structures guarantee overall system progress, whereas wait-free data structures guarantee the progress of each and every thread, providing the desirable non-starvation guarantee for concurrent data structures. While practical lock-free implementations are known for various data structures, wait-free data structure designs are rare. Wait-free implementations have been notoriously hard to design and often inefficient. In this work we present a transformation of lock-free algorithms to wait-free ones, allowing even a non-expert to transform a lock-free data structure into a practical wait-free one. The transformation requires that the lock-free data structure is given in a normalized form defined in this work. Using the new method, we have designed and implemented a wait-free linked list, skiplist, and tree, and we measured their performance. It turns out that for all these data structures the wait-free implementations are only a few percent slower than their lock-free counterparts, while still guaranteeing non-starvation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Pusukuri:2014:LCA, author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi Narayan Bhuyan", title = "Lock contention aware thread migrations", journal = j-SIGPLAN, volume = "49", number = "8", pages = "369--370", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555273", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "On a cache-coherent multicore multiprocessor system, the performance of a multithreaded application with high lock contention is very sensitive to the distribution of application threads across multiple processors.
This is because the distribution of threads impacts the frequency of lock transfers between processors, which in turn impacts the frequency of last-level cache (LLC) misses that lie on the critical path of execution. Inappropriate distribution of threads across processors increases LLC misses in the critical path and significantly degrades performance of multithreaded programs. To alleviate the above problem, this paper overviews a thread migration technique, which migrates threads of a multithreaded program across multicore processors so that threads seeking locks are more likely to find the locks on the same processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Lee:2014:IFL, author = "Kyu Hyung Lee and Dohyeong Kim and Xiangyu Zhang", title = "Infrastructure-free logging and replay of concurrent execution on multiple cores", journal = j-SIGPLAN, volume = "49", number = "8", pages = "371--372", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555274", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We develop a logging and replay technique for real concurrent execution on multiple cores. Our technique directly works on binaries and does not require any hardware or complex software infrastructure support. We focus on minimizing logging overhead: the technique logs only a subset of system calls and thread spawns. Replay is on a single core. During replay, our technique first tries to follow only the event order in the log. However, due to schedule differences, replay may fail. An exploration process is then triggered to search for a schedule that allows the replay to make progress. Exploration is performed within a window preceding the point of replay failure. During exploration, our technique first tries to reorder synchronized blocks. If that does not lead to progress, it further reorders shared variable accesses. The exploration is facilitated by a sophisticated caching mechanism. Our experiments on real-world programs and real workloads show that the proposed technique has very low logging overhead (2.6\% on average) and fast schedule reconstruction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Aguston:2014:PHC, author = "Cfir Aguston and Yosi Ben Asher and Gadi Haber", title = "Parallelization hints via code skeletonization", journal = j-SIGPLAN, volume = "49", number = "8", pages = "373--374", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555275", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Tools that provide optimization hints for program developers face severe obstacles and are often unable to provide meaningful guidance on how to parallelize real--life applications. The main reason is the high complexity and large size of commercially valuable code. Such code is often rich with pointers, heavily nested conditional statements, nested while--based loops, function calls, etc.
These constructs prevent existing compiler analyses from extracting the full parallelization potential. We propose a new paradigm to overcome this issue by automatically transforming the code into a much simpler skeleton-like form that is more conducive to auto-parallelization. We then apply existing tools of source--level automatic parallelization on the skeletonized code in order to expose possible parallelization patterns. The skeleton code, along with the parallelized version, is then provided to the programmer in the form of an IDE (Integrated Development Environment) recommendation. The proposed skeletonization algorithm replaces pointers by integer indexes and C-struct references by references to multi-dimensional arrays. This is because automatic parallelizers cannot handle pointer expressions. For example, {\tt while(p != NULL)\{ p->val++; p=p->next; \}} will be skeletonized to the parallelizable {\tt for(Ip=0;Ip < N; Ip++) \{ Aval[Ip]++; \}} where {\tt Aval[]} holds the embedding of the original list. It follows that the main goal of the skeletonization process is to embed pointer-based data structures into arrays. Though the skeletonized code is not semantically equivalent to the original code, it points out a possible parallelization pattern for this code segment and can be used as an effective parallelization hint to the programmer. We applied the method to several representative benchmarks from SPEC CPU 2000 and reached up to 80\% performance gain after several sequential code segments had been manually parallelized based on the parallelization patterns of the generated skeletons. In a different set of experiments we tried to estimate the potential of skeletonization for a larger set of programs in SPEC 2000 and obtained an estimate of 27\% additional loops that can be parallelized/vectorized due to skeletonization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Wang:2014:CBL, author = "Wenwen Wang and Chenggang Wu and Pen-Chung Yew and Xiang Yuan and Zhenjiang Wang and Jianjun Li and Xiaobing Feng", title = "Concurrency bug localization using shared memory access pairs", journal = j-SIGPLAN, volume = "49", number = "8", pages = "375--376", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555276", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Non-determinism in concurrent programs makes their debugging much more challenging than that in sequential programs. To mitigate such difficulties, we propose a new technique to automatically locate buggy shared memory accesses that triggered concurrency bugs. Compared to existing fault localization techniques that are based on empirical statistical approaches, this technique has two advantages. First, as long as enough successful runs of a concurrent program are collected, the proposed technique can locate buggy memory accesses to the shared data even with only a single failed run captured, as opposed to the multiple failed runs needed by other statistical approaches.
Second, the proposed technique is more precise because it considers memory accesses in those failed runs that terminate prematurely.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Leung:2014:TMS, author = "Vitus J. Leung and David P. Bunde and Jonathan Ebbers and Stefan P. Feer and Nickolas W. Price and Zachary D. Rhodes and Matthew Swank", title = "Task mapping stencil computations for non-contiguous allocations", journal = j-SIGPLAN, volume = "49", number = "8", pages = "377--378", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555277", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We examine task mapping algorithms for systems that allocate jobs non-contiguously. Several studies have shown that task placement affects job running time. We focus on jobs with a stencil communication pattern and use experiments on a Cray XE to evaluate novel task mapping algorithms as well as some adapted to this setting. This is done with the miniGhost miniApp which mimics the performance of CTH, a shock physics application. Our strategies improve average and single-run times by as much as 28\% and 36\% over a baseline strategy, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Wimmer:2014:DST, author = "Martin Wimmer and Francesco Versaci and Jesper Larsson Tr{\"a}ff and Daniel Cederman and Philippas Tsigas", title = "Data structures for task-based priority scheduling", journal = j-SIGPLAN, volume = "49", number = "8", pages = "379--380", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555278", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present three lock-free data structures for priority task scheduling: a priority work-stealing one, a centralized one with $ \rho $-relaxed semantics, and a hybrid one combining both concepts. With the single-source shortest path (SSSP) problem as an example, we show how the different approaches affect the prioritization and provide upper bounds on the number of examined nodes. We argue that priority task scheduling allows for an intuitive and easy way to parallelize the SSSP problem, a notoriously hard task. Experimental evidence supports the good scalability of the resulting algorithm. The larger aim of this work is to understand the trade-offs between scalability and priority guarantees in task scheduling systems. We show that $ \rho $-relaxation is a valuable technique for improving the former, while still allowing semantic constraints to be satisfied: the lock-free, hybrid $k$-priority data structure can scale as well as work-stealing, while still providing strong priority scheduling guarantees, which depend on the parameter $k$.
Our theoretical results open up possibilities for even more scalable data structures by adopting a weaker form of $ \rho $-relaxation, which still enables the semantic constraints to be respected.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Gomez:2014:DSD, author = "Leonardo Bautista Gomez and Franck Cappello", title = "Detecting silent data corruption through data dynamic monitoring for scientific applications", journal = j-SIGPLAN, volume = "49", number = "8", pages = "381--382", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555279", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parallel programming has become one of the best ways to express scientific models that simulate a wide range of natural phenomena. These complex parallel codes are deployed and executed on large-scale parallel computers, making them important tools for scientific discovery. As supercomputers get faster and larger, the increasing number of components is leading to higher failure rates. In particular, the miniaturization of electronic components is expected to lead to a dramatic rise in soft errors and data corruption. Moreover, soft errors can corrupt data silently and generate large inaccuracies or wrong results at the end of the computation. In this paper we propose a novel technique to detect silent data corruption based on data monitoring. Using this technique, an application can learn the normal dynamics of its datasets, allowing it to quickly spot anomalies. We evaluate our technique with synthetic benchmarks and we show that our technique can detect up to 50\% of injected errors while incurring only negligible overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Sandes:2014:FGP, author = "Edans F. de O. Sandes and Guillermo Miranda and Alba C. M. A. Melo and Xavier Martorell and Eduard Ayguade", title = "Fine-grain parallel megabase sequence comparison with multiple heterogeneous {GPUs}", journal = j-SIGPLAN, volume = "49", number = "8", pages = "383--384", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555280", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper proposes and evaluates a parallel strategy to execute the exact Smith-Waterman (SW) algorithm for megabase DNA sequences in heterogeneous multi-GPU platforms. In our strategy, the computation of a single huge SW matrix is spread over multiple GPUs, which communicate border elements to the neighbour, using a circular buffer mechanism that hides the communication overhead.
We compared 4 pairs of human-chimpanzee homologous chromosomes using 2 different GPU environments, obtaining a performance of up to 140.36 GCUPS (billions of cell updates per second) with 3 heterogeneous GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Golan-Gueta:2014:ASL, author = "Guy Golan-Gueta and G. Ramalingam and Mooly Sagiv and Eran Yahav", title = "Automatic semantic locking", journal = j-SIGPLAN, volume = "49", number = "8", pages = "385--386", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555281", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we consider concurrent programs in which the shared state consists of instances of linearizable ADTs (abstract data types). We develop a novel automated approach to concurrency control that addresses a common need: the need to atomically execute a code fragment, which may contain multiple ADT operations on multiple ADT instances. In our approach, each ADT implements ADT-specific semantic locking operations that serve to exploit the semantics of ADT operations. We develop a synthesis algorithm that automatically inserts calls to these locking operations in a set of given code fragments (in a client program) to ensure that these code fragments execute atomically without deadlocks, and without rollbacks. We have implemented the synthesis algorithm and several general-purpose ADTs with semantic locking. We have applied the synthesis algorithm to several Java programs that use these ADTs. Our results show that our approach enables efficient and scalable synchronization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Hassan:2014:OTB, author = "Ahmed Hassan and Roberto Palmieri and Binoy Ravindran", title = "Optimistic transactional boosting", journal = j-SIGPLAN, volume = "49", number = "8", pages = "387--388", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555283", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Herlihy and Koskinen's transactional boosting methodology addressed the challenge of converting concurrent data structures into transactional ones. We present an optimistic methodology for boosting concurrent collections. Optimistic boosting allows greater data structure-specific optimizations, easier integration with STM frameworks, and lower restrictions on the boosted operations than the original boosting methodology.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Agrawal:2014:PGS, author = "Kunal Agrawal and Jeremy T. 
Fineman and Brendan Sheridan and Jim Sukha and Robert Utterback", title = "Provably good scheduling for parallel programs that use data structures through implicit batching", journal = j-SIGPLAN, volume = "49", number = "8", pages = "389--390", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555284", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This poster proposes an efficient runtime scheduler that provides provable performance guarantees to parallel programs that use data structures through the use of implicit batching.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Ma:2014:TAC, author = "Lin Ma and Kunal Agrawal and Roger D. Chamberlain", title = "Theoretical analysis of classic algorithms on highly-threaded many-core {GPUs}", journal = j-SIGPLAN, volume = "49", number = "8", pages = "391--392", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555285", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Threaded many-core memory (TMM) model provides a framework to analyze the performance of algorithms on GPUs. Here, we investigate the effectiveness of the TMM model by analyzing algorithms for 3 classic problems --- suffix tree/array for string matching, fast Fourier transform, and merge sort --- under this model. Our findings indicate that the TMM model can explain and predict previously unexplained trends and artifacts in experimental data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Tomkins:2014:SIP, author = "Daniel Tomkins and Timmie Smith and Nancy M. Amato and Lawrence Rauchwerger", title = "{SCCMulti}: an improved parallel strongly connected components algorithm", journal = j-SIGPLAN, volume = "49", number = "8", pages = "393--394", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555286", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Tarjan's famous linear time, sequential algorithm for finding the strongly connected components (SCCs) of a graph relies on depth first search, which is inherently sequential. Deterministic parallel algorithms solve this problem in logarithmic time using matrix multiplication techniques, but matrix multiplication requires a large amount of total work. Randomized algorithms based on reachability --- the ability to get from one vertex to another along a directed path --- greatly improve the work bound in the average case. However, these algorithms do not always perform well; for instance, Divide-and-Conquer Strong Components (DCSC), a scalable, divide-and-conquer algorithm, has good expected theoretical limits, but can perform very poorly on graphs for which the maximum reachability of any vertex is small. 
A related algorithm, MultiPivot, gives very high probability guarantees on the total amount of work for all graphs, but this improvement introduces an overhead that increases the average running time. This work introduces SCCMulti, a multi-pivot improvement of DCSC that offers the same consistency as MultiPivot without the time overhead. We provide experimental results demonstrating SCCMulti's scalability; these results also show that SCCMulti is more consistent than DCSC and is always faster than MultiPivot.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Luo:2014:ISM, author = "Miao Luo and Xiaoyi Lu and Khaled Hamidouche and Krishna Kandalla and Dhabaleswar K. Panda", title = "Initial study of multi-endpoint runtime for {MPI + OpenMP} hybrid programming model on multi-core systems", journal = j-SIGPLAN, volume = "49", number = "8", pages = "395--396", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555287", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "State-of-the-art MPI libraries rely on locks to guarantee thread-safety. This discourages application developers from using multiple threads to perform MPI operations. In this paper, we propose a high performance, lock-free multi-endpoint MPI runtime, which can achieve up to 40\% improvement for point-to-point operation and one representative collective operation with minimum or no modifications to the existing applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Isaacs:2014:ELS, author = "Katherine E. Isaacs and Todd Gamblin and Abhinav Bhatele and Peer-Timo Bremer and Martin Schulz and Bernd Hamann", title = "Extracting logical structure and identifying stragglers in parallel execution traces", journal = j-SIGPLAN, volume = "49", number = "8", pages = "397--398", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555288", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a new approach to automatically extract an idealized logical structure from a parallel execution trace. We use this structure to define intuitive metrics such as the lateness of a process involved in a parallel execution. By analyzing and illustrating traces in terms of logical steps, we leverage a developer's understanding of the happened-before relations in a parallel program. 
This technique can uncover dependency chains, elucidate communication patterns, and highlight sources and propagation of delays, all of which may be obscured in a traditional trace visualization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '14 conference proceedings.", } @Article{Fisher:2014:UFM, author = "Kathleen Fisher", title = "Using formal methods to enable more secure vehicles: {DARPA}'s {HACMS} program", journal = j-SIGPLAN, volume = "49", number = "9", pages = "1--1", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628165", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Networked embedded systems are ubiquitous in modern society. Examples include SCADA systems that manage physical infrastructure, medical devices such as pacemakers and insulin pumps, and vehicles such as airplanes and automobiles. Such devices are connected to networks for a variety of compelling reasons, including the ability to access diagnostic information conveniently, perform software updates, provide innovative features, and lower costs. Researchers and hackers have shown that these kinds of networked embedded systems are vulnerable to remote attacks and that such attacks can cause physical damage and can be hidden from monitors [1, 4]. DARPA launched the HACMS program to create technology to make such systems dramatically harder to attack successfully. Specifically, HACMS is pursuing a clean-slate, formal methods-based approach to the creation of high-assurance vehicles, where high assurance is defined to mean functionally correct and satisfying appropriate safety and security properties. Specific technologies include program synthesis, domain-specific languages, and theorem provers used as program development environments. Targeted software includes operating system components such as hypervisors, microkernels, file systems, and device drivers as well as control systems such as autopilots and adaptive cruise controls. Program researchers are leveraging existing high-assurance software including NICTA's seL4 microkernel and INRIA's CompCert compiler. Although the HACMS project is less than halfway done, the program has already achieved some remarkable success. At program kick-off, a Red Team easily hijacked the baseline open-source quadcopter that HACMS researchers are using as a research platform. At the end of eighteen months, the Red Team was not able to hijack the newly-minted ``SMACCMCopter'' running high-assurance HACMS code, despite being given six weeks and full access to the source code of the copter. An expert in penetration testing called the SMACCMCopter ``the most secure UAV on the planet.'' In this talk, I will describe the HACMS program: its motivation, the underlying technologies, current results, and future directions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Hickey:2014:BES, author = "Patrick C. 
Hickey and Lee Pike and Trevor Elliott and James Bielman and John Launchbury", title = "Building embedded systems with embedded {DSLs}", journal = j-SIGPLAN, volume = "49", number = "9", pages = "3--9", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628146", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We report on our experiences in synthesizing a fully-featured autopilot from embedded domain-specific languages (EDSLs) hosted in Haskell. The autopilot is approximately 50k lines of C code generated from 10k lines of EDSL code and includes control laws, mode logic, encrypted communications system, and device drivers. The autopilot was built in less than two engineer years. This is the story of how EDSLs provided the productivity and safety gains to do large-scale low-level embedded programming and lessons we learned in doing so.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Schlesinger:2014:CNP, author = "Cole Schlesinger and Michael Greenberg and David Walker", title = "Concurrent {NetCore}: from policies to pipelines", journal = j-SIGPLAN, volume = "49", number = "9", pages = "11--24", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628157", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In a Software-Defined Network (SDN), a central, computationally powerful controller manages a set of distributed, computationally simple switches. The controller computes a policy describing how each switch should route packets and populates packet-processing tables on each switch with rules to enact the routing policy. As network conditions change, the controller continues to add and remove rules from switches to adjust the policy as needed. Recently, the SDN landscape has begun to change as several proposals for new, reconfigurable switching architectures, such as RMT [5] and FlexPipe [14] have emerged. These platforms provide switch programmers with many, flexible tables for storing packet-processing rules, and they offer programmers control over the packet fields that each table can analyze and act on. These reconfigurable switch architectures support a richer SDN model in which a switch configuration phase precedes the rule population phase [4]. In the configuration phase, the controller sends the switch a graph describing the layout and capabilities of the packet processing tables it will require during the population phase. Armed with this foreknowledge, the switch can allocate its hardware (or software) resources more efficiently. We present a new, typed language, called Concurrent NetCore, for specifying routing policies and graphs of packet-processing tables. Concurrent NetCore includes features for specifying sequential, conditional and concurrent control-flow between packet-processing tables. We develop a fine-grained operational model for the language and prove this model coincides with a higher-level denotational model when programs are well-typed. We also prove several additional properties of well-typed programs, including strong normalization and determinism. 
To illustrate the utility of the language, we develop linguistic models of both the RMT and FlexPipe architectures and we give a multi-pass compilation algorithm that translates graphs and routing policies to the RMT model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Schoepe:2014:STI, author = "Daniel Schoepe and Daniel Hedin and Andrei Sabelfeld", title = "{SeLINQ}: tracking information across application-database boundaries", journal = j-SIGPLAN, volume = "49", number = "9", pages = "25--38", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628151", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "The root cause for confidentiality and integrity attacks against computing systems is insecure information flow. The complexity of modern systems poses a major challenge to secure end-to-end information flow, ensuring that the insecurity of a single component does not render the entire system insecure. While information flow in a variety of languages and settings has been thoroughly studied in isolation, the problem of tracking information across component boundaries has been largely out of reach of the work so far. This is unsatisfactory because tracking information across component boundaries is necessary for end-to-end security. This paper proposes a framework for uniform tracking of information flow through both the application and the underlying database. Key enabler of the uniform treatment is recent work by Cheney et al., which studies database manipulation via an embedded language-integrated query language (with Microsoft's LINQ on the backend). Because both the host language and the embedded query languages are functional F\#-like languages, we are able to leverage information-flow enforcement for functional languages to obtain information-flow control for databases ``for free'', synergize it with information-flow control for applications and thus guarantee security across application-database boundaries. We develop the formal results in the form of a security type system that includes a treatment of algebraic data types and pattern matching, and establish its soundness. On the practical side, we implement the framework and demonstrate its usefulness in a case study with a realistic movie rental database.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Chen:2014:TBP, author = "Sheng Chen and Martin Erwig", title = "Type-based parametric analysis of program families", journal = j-SIGPLAN, volume = "49", number = "9", pages = "39--51", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628155", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Previous research on static analysis for program families has focused on lifting analyses for single, plain programs to program families by employing idiosyncratic representations. 
The lifting effort typically involves a significant amount of work for proving the correctness of the lifted algorithm and demonstrating its scalability. In this paper, we propose a parameterized static analysis framework for program families that can automatically lift a class of type-based static analyses for plain programs to program families. The framework consists of a parametric logical specification and a parametric variational constraint solver. We prove that a lifted algorithm is correct provided that the underlying analysis algorithm is correct. An evaluation of our framework has revealed an error in a previous manually lifted analysis. Moreover, performance tests indicate that the overhead incurred by the general framework is bounded by a factor of 2.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Stansifer:2014:RSM, author = "Paul Stansifer and Mitchell Wand", title = "{Romeo}: a system for more flexible binding-safe programming", journal = j-SIGPLAN, volume = "49", number = "9", pages = "53--65", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628162", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Current languages for safely manipulating values with names only support term languages with simple binding syntax. As a result, no tools exist to safely manipulate code written in those languages for which name problems are the most challenging. We address this problem with Romeo, a language that respects $ \alpha $-equivalence on its values, and which has access to a rich specification language for binding, inspired by attribute grammars. Our work has the complex-binding support of David Herman's $ \lambda_m$, but is a full-fledged binding-safe language like Pure FreshML.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Grabmayer:2014:MSL, author = "Clemens Grabmayer and Jan Rochel", title = "Maximal sharing in the {Lambda} calculus with letrec", journal = j-SIGPLAN, volume = "49", number = "9", pages = "67--80", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628148", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Increasing sharing in programs is desirable to compactify the code, and to avoid duplication of reduction work at run-time, thereby speeding up execution. We show how a maximal degree of sharing can be obtained for programs expressed as terms in the lambda calculus with letrec. We introduce a notion of `maximal compactness' for $ \lambda_{\rm letrec}$-terms among all terms with the same infinite unfolding. Instead of being defined purely syntactically, this notion is based on a graph semantics. $ \lambda_{\rm letrec}$-terms are interpreted as first-order term graphs so that unfolding equivalence between terms is preserved and reflected through bisimilarity of the term graph interpretations. Compactness of the term graphs can then be compared via functional bisimulation. 
We describe practical and efficient methods for the following two problems: transforming a $ \lambda_{\rm letrec}$-term into a maximally compact form; and deciding whether two $ \lambda_{\rm letrec}$-terms are unfolding-equivalent. The transformation of a $ \lambda_{\rm letrec}$-term $L$ into maximally compact form $ L_0$ proceeds in three steps: (i) translate $L$ into its term graph $ G = [[L]]$; (ii) compute the maximally shared form of $G$ as its bisimulation collapse $ G_0$; (iii) read back a $ \lambda_{\rm letrec}$-term $ L_0$ from the term graph $ G_0$ with the property $ [[L_0]] = G_0$. Then $ L_0$ represents a maximally shared term graph, and it has the same unfolding as $L$. The procedure for deciding whether two given $ \lambda_{\rm letrec}$-terms $ L_1$ and $ L_2$ are unfolding-equivalent computes their term graph interpretations $ [[L_1]]$ and $ [[L_2]]$, and checks whether these are bisimilar. For illustration, we also provide a readily usable implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Bergstrom:2014:PEH, author = "Lars Bergstrom and Matthew Fluet and Matthew Le and John Reppy and Nora Sandler", title = "Practical and effective higher-order optimizations", journal = j-SIGPLAN, volume = "49", number = "9", pages = "81--93", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628153", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Inlining is an optimization that replaces a call to a function with that function's body. This optimization not only reduces the overhead of a function call, but can expose additional optimization opportunities to the compiler, such as removing redundant operations or unused conditional branches. Another optimization, copy propagation, replaces a redundant copy of a still-live variable with the original. Copy propagation can reduce the total number of live variables, reducing register pressure and memory usage, and possibly eliminating redundant memory-to-memory copies. In practice, both of these optimizations are implemented in nearly every modern compiler. These two optimizations are practical to implement and effective in first-order languages, but in languages with lexically-scoped first-class functions (a.k.a. closures), these optimizations are not available to code programmed in a higher-order style. With higher-order functions, the analysis challenge has been that the environment at the call site must be the same as at the closure capture location, up to the free variables, or the meaning of the program may change. Olin Shivers' 1991 dissertation called this family of optimizations super $ \beta $ and he proposed one analysis technique, called reflow, to support these optimizations. Unfortunately, reflow has proven too expensive to implement in practice. Because these higher-order optimizations are not available in functional-language compilers, programmers studiously avoid uses of higher-order values that cannot be optimized (particularly in compiler benchmarks). This paper provides the first practical and effective technique for super $ \beta $ (higher-order) inlining and copy propagation, which we call unchanged variable analysis. 
We show that this technique is practical by implementing it in the context of a real compiler for an ML-family language and showing that the required analyses have costs below 3\% of the total compilation time. This technique's effectiveness is shown through a set of benchmarks and example programs, where this analysis exposes additional potential optimization sites.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Hackett:2014:WWM, author = "Jennifer Hackett and Graham Hutton", title = "Worker\slash wrapper\slash makes it\slash faster", journal = j-SIGPLAN, volume = "49", number = "9", pages = "95--107", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628142", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Much research in program optimization has focused on formal approaches to correctness: proving that the meaning of programs is preserved by the optimisation. Paradoxically, there has been comparatively little work on formal approaches to efficiency: proving that the performance of optimized programs is actually improved. This paper addresses this problem for a general-purpose optimization technique, the worker/wrapper transformation. In particular, we use the call-by-need variant of improvement theory to establish conditions under which the worker/wrapper transformation is formally guaranteed to preserve or improve the time performance of programs in lazy languages such as Haskell.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Downen:2014:CSC, author = "Paul Downen and Zena M. Ariola", title = "Compositional semantics for composable continuations: from abortive to delimited control", journal = j-SIGPLAN, volume = "49", number = "9", pages = "109--122", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628147", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parigot's $ \lambda \mu $-calculus, a system for computational reasoning about classical proofs, serves as a foundation for control operations embodied by operators like Scheme's callcc. We demonstrate that the call-by-value theory of the $ \lambda \mu $-calculus contains a latent theory of delimited control, and that a known variant of $ \lambda \mu $ which unshackles the syntax yields a calculus of composable continuations from the existing constructs and rules for classical control. To relate to the various formulations of control effects, and to continuation-passing style, we use a form of compositional program transformations which preserves the underlying structure of equational theories, contexts, and substitution. 
Finally, we generalize the call-by-name and call-by-value theories of the $ \lambda \mu $-calculus by giving a single parametric theory that encompasses both, allowing us to generate a call-by-need instance that defines a calculus of classical and delimited control with lazy evaluation and sharing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Petricek:2014:CCC, author = "Tomas Petricek and Dominic Orchard and Alan Mycroft", title = "Coeffects: a calculus of context-dependent computation", journal = j-SIGPLAN, volume = "49", number = "9", pages = "123--135", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628160", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The notion of context in functional languages no longer refers just to variables in scope. Context can capture additional properties of variables (usage patterns in linear logics; caching requirements in dataflow languages) as well as additional resources or properties of the execution environment (rebindable resources; platform version in a cross-platform application). The recently introduced notion of coeffects captures the latter, whole-context properties, but it failed to capture fine-grained per-variable properties. We remedy this by developing a generalized coeffect system with annotations indexed by a coeffect shape. By instantiating a concrete shape, our system captures previously studied flat (whole-context) coeffects, but also structural (per-variable) coeffects, making coeffect analyses more useful. We show that the structural system enjoys desirable syntactic properties and we give a categorical semantics using extended notions of indexed comonad. The examples presented in this paper are based on analysis of established language features (liveness, linear logics, dataflow, dynamic scoping) and we argue that such context-aware properties will also be useful for future development of languages for increasingly heterogeneous and distributed platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Findler:2014:BSC, author = "Robert Bruce Findler", title = "Behavioral software contracts", journal = j-SIGPLAN, volume = "49", number = "9", pages = "137--138", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2632855", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmers embrace contracts. They can use the language they know and love to formulate logical assertions about the behavior of their programs. They can use the existing IDE infrastructure to log contracts, to test, to debug, and to profile their programs. The keynote presents the challenges and rewards of supporting contracts in a modern, full-spectrum programming language. It covers technical challenges of contracts while demonstrating the non-technical motivation for contract system design choices and showing how contracts and contract research can serve practicing programmers. 
The remainder of this article is a literature survey of contract research, with an emphasis on recent work about higher-order contracts and blame.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Nguyen:2014:SCV, author = "Ph{\'u}c C. Nguyen and Sam Tobin-Hochstadt and David {Van Horn}", title = "Soft contract verification", journal = j-SIGPLAN, volume = "49", number = "9", pages = "139--152", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628156", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Behavioral software contracts are a widely used mechanism for governing the flow of values between components. However, run-time monitoring and enforcement of contracts imposes significant overhead and delays discovery of faulty components to run-time. To overcome these issues, we present soft contract verification, which aims to statically prove either complete or partial contract correctness of components, written in an untyped, higher-order language with first-class contracts. Our approach uses higher-order symbolic execution, leveraging contracts as a source of symbolic values including unknown behavioral values, and employs an updatable heap of contract invariants to reason about flow-sensitive facts. We prove the symbolic execution soundly approximates the dynamic semantics and that verified programs can't be blamed. The approach is able to analyze first-class contracts, recursive data structures, unknown functions, and control-flow-sensitive refinements of values, which are all idiomatic in dynamic languages. It makes effective use of an off-the-shelf solver to decide problems without heavy encodings. The approach is competitive with a wide range of existing tools --- including type systems, flow analyzers, and model checkers --- on their own benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Ramsey:2014:THD, author = "Norman Ramsey", title = "On teaching *how to design programs*: observations from a newcomer", journal = j-SIGPLAN, volume = "49", number = "9", pages = "153--166", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628137", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a personal, qualitative case study of a first course using How to Design Programs and its functional teaching languages. The paper reconceptualizes the book's six-step design process as an eight-step design process ending in a new ``review and refactor'' step. It recommends specific approaches to students' difficulties with function descriptions, function templates, data examples, and other parts of the design process. It connects the process to interactive ``world programs.'' It recounts significant, informative missteps in course design and delivery. 
Finally, it identifies some unsolved teaching problems and some potential solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Ohori:2014:SIP, author = "Atsushi Ohori and Katsuhiro Ueno and Kazunori Hoshi and Shinji Nozaki and Takashi Sato and Tasuku Makabe and Yuki Ito", title = "{SML\#} in industry: a practical {ERP} system development", journal = j-SIGPLAN, volume = "49", number = "9", pages = "167--173", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628164", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper reports on our industry-academia project of using a functional language in business software production. The general motivation behind the project is our ultimate goal of adopting an ML-style higher-order typed functional language in a wide range of ordinary software development in industry. To probe the feasibility and identify various practical problems and needs, we have conducted a 15 month pilot project for developing an enterprise resource planning (ERP) system in SML\#. The project has successfully completed as we have planned, demonstrating the feasibility of SML\#. In particular, seamless integration of SQL and direct C language interface are shown to be useful in reliable and efficient development of a data intensive business application. During the program development, we have found several useful functional programming patterns and a number of possible extensions of an ML-style language with records. This paper reports on the project details and the lessons learned from the project.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Mulligan:2014:LRE, author = "Dominic P. Mulligan and Scott Owens and Kathryn E. Gray and Tom Ridge and Peter Sewell", title = "{Lem}: reusable engineering of real-world semantics", journal = j-SIGPLAN, volume = "49", number = "9", pages = "175--188", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628143", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent years have seen remarkable successes in rigorous engineering: using mathematically rigorous semantic models (not just idealised calculi) of real-world processors, programming languages, protocols, and security mechanisms, for testing, proof, analysis, and design. Building these models is challenging, requiring experimentation, dialogue with vendors or standards bodies, and validation; their scale adds engineering issues akin to those of programming to the task of writing clear and usable mathematics. But language and tool support for specification is lacking. Proof assistants can be used but bring their own difficulties, and a model produced in one, perhaps requiring many person-years effort and maintained over an extended period, cannot be used by those familiar with another. We introduce Lem, a language for engineering reusable large-scale semantic models. 
The Lem design takes inspiration both from functional programming languages and from proof assistants, and Lem definitions are translatable into OCaml for testing, Coq, HOL4, and Isabelle/HOL for proof, and LaTeX and HTML for presentation. This requires a delicate balance of expressiveness, careful library design, and implementation of transformations --- akin to compilation, but subject to the constraint of producing usable and human-readable code for each target. Lem's effectiveness is demonstrated by its use in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Breitner:2014:SZC, author = "Joachim Breitner and Richard A. Eisenberg and Simon Peyton Jones and Stephanie Weirich", title = "Safe zero-cost coercions for {Haskell}", journal = j-SIGPLAN, volume = "49", number = "9", pages = "189--202", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628141", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Generative type abstractions --- present in Haskell, OCaml, and other languages --- are useful concepts to help prevent programmer errors. They serve to create new types that are distinct at compile time but share a run-time representation with some base type. We present a new mechanism that allows for zero-cost conversions between generative type abstractions and their representations, even when such types are deeply nested. We prove type safety in the presence of these conversions and have implemented our work in GHC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Pottier:2014:HME, author = "Fran{\c{c}}ois Pottier", title = "{Hindley--Milner} elaboration in applicative style: functional pearl", journal = j-SIGPLAN, volume = "49", number = "9", pages = "203--212", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628145", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Type inference --- the problem of determining whether a program is well-typed --- is well-understood. In contrast, elaboration --- the task of constructing an explicitly-typed representation of the program --- seems to have received relatively little attention, even though, in a non-local type inference system, it is non-trivial. We show that the constraint-based presentation of Hindley--Milner type inference can be extended to deal with elaboration, while preserving its elegance. 
This involves introducing a new notion of ``constraint with a value'', which forms an applicative functor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Winograd-Cort:2014:SNI, author = "Daniel Winograd-Cort and Paul Hudak", title = "Settable and non-interfering signal functions for {FRP}: how a first-order switch is more than enough", journal = j-SIGPLAN, volume = "49", number = "9", pages = "213--225", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628140", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Functional Reactive Programming (FRP) provides a method for programming continuous, reactive systems by utilizing signal functions that, abstractly, transform continuous input signals into continuous output signals. These signals may also be streams of events, and indeed, by allowing signal functions themselves to be the values carried by these events (in essence, signals of signal functions), one can conveniently make discrete changes in program behavior by ``switching'' into and out of these signal functions. This higher-order notion of switching is common among many FRP systems, in particular those based on arrows, such as Yampa. Although convenient, the power of switching is often an overkill and can pose problems for certain types of program optimization (such as causal commutative arrows [14]), as it causes the structure of the program to change dynamically at run-time. Without a notion of just-in-time compilation or related idea, which itself is beset with problems, such optimizations are not possible at compile time. This paper introduces two new ideas that obviate, in a predominance of cases, the need for switching. The first is a non-interference law for arrows with choice that allows an arrowized FRP program to dynamically alter its own structure (within statically limited bounds) as well as abandon unused streams. The other idea is a notion of a settable signal function that allows a signal function to capture its present state and later be restarted from some previous state. With these two features, canonical uses of higher-order switchers can be replaced with a suitable first-order design, thus enabling a broader range of static optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Chen:2014:FPD, author = "Yan Chen and Umut A. Acar and Kanat Tangwongsan", title = "Functional programming for dynamic and large data with self-adjusting computation", journal = j-SIGPLAN, volume = "49", number = "9", pages = "227--240", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628150", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Combining type theory, language design, and empirical work, we present techniques for computing with large and dynamically changing datasets. 
Based on lambda calculus, our techniques are suitable for expressing a diverse set of algorithms on large datasets and, via self-adjusting computation, enable computations to respond automatically to changes in their data. To improve the scalability of self-adjusting computation, we present a type system for precise dependency tracking that minimizes the time and space for storing dependency metadata. The type system eliminates an important assumption of prior work that can lead to recording spurious dependencies. We present a type-directed translation algorithm that generates correct self-adjusting programs without relying on this assumption. We then show a probabilistic-chunking technique to further decrease space usage by controlling the fundamental space-time tradeoff in self-adjusting computation. We implement and evaluate these techniques, showing promising results on challenging benchmarks involving large graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Weirich:2014:DT, author = "Stephanie Weirich", title = "Depending on types", journal = j-SIGPLAN, volume = "49", number = "9", pages = "241--241", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2631168", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Is Haskell a dependently typed programming language? Should it be? GHC's many type-system features, such as Generalized Algebraic Datatypes (GADTs), datatype promotion, multiparameter type classes, and type families, give programmers the ability to encode domain-specific invariants in their types. Clever Haskell programmers have used these features to enhance the reasoning capabilities of static type checking. But really, how far have we come? Could we do more? In this talk, I will discuss dependently typed programming in Haskell, through examples, analysis and comparisons with modern full-spectrum dependently typed languages, such as Coq, Agda and Idris. What sorts of dependently typed programming can be done in Haskell now? What could GHC learn from these languages? Conversely, what lessons can GHC offer in return?", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Angiuli:2014:HPT, author = "Carlo Angiuli and Edward Morehouse and Daniel R. Licata and Robert Harper", title = "Homotopical patch theory", journal = j-SIGPLAN, volume = "49", number = "9", pages = "243--256", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628158", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Homotopy type theory is an extension of Martin-L{\"o}f type theory, based on a correspondence with homotopy theory and higher category theory. In homotopy type theory, the propositional equality type becomes proof-relevant, and corresponds to paths in a space. This allows for a new class of datatypes, called higher inductive types, which are specified by constructors not only for points but also for paths. 
In this paper, we consider a programming application of higher inductive types. Version control systems such as Darcs are based on the notion of patches --- syntactic representations of edits to a repository. We show how patch theory can be developed in homotopy type theory. Our formulation separates formal theories of patches from their interpretation as edits to repositories. A patch theory is presented as a higher inductive type. Models of a patch theory are given by maps out of that type, which, being functors, automatically preserve the structure of patches. Several standard tools of homotopy theory come into play, demonstrating the use of these methods in a practical programming context.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Cockx:2014:PMK, author = "Jesper Cockx and Dominique Devriese and Frank Piessens", title = "Pattern matching without {K}", journal = j-SIGPLAN, volume = "49", number = "9", pages = "257--268", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628139", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "Dependent pattern matching is an intuitive way to write programs and proofs in dependently typed languages. It is reminiscent of both pattern matching in functional languages and case analysis in on-paper mathematics. However, in general it is incompatible with new type theories such as homotopy type theory (HoTT). As a consequence, proofs in such theories are typically harder to write and to understand. The source of this incompatibility is the reliance of dependent pattern matching on the so-called K axiom --- also known as the uniqueness of identity proofs --- which is inadmissible in HoTT. The Agda language supports an experimental criterion to detect definitions by pattern matching that make use of the K axiom, but so far it lacked a formal correctness proof. In this paper, we propose a new criterion for dependent pattern matching without K, and prove it correct by a translation to eliminators in the style of Goguen et al. (2006). Our criterion both allows more good definitions than existing proposals, and solves a previously undetected problem in the criterion offered by Agda. It has been implemented in Agda and is the first to be supported by a formal proof. Thus it brings the benefits of dependent pattern matching to contexts where we cannot assume K, such as HoTT. It also points the way to new forms of dependent pattern matching, for example on higher inductive types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Vazou:2014:RTH, author = "Niki Vazou and Eric L. 
Seidel and Ranjit Jhala and Dimitrios Vytiniotis and Simon Peyton-Jones", title = "Refinement types for {Haskell}", journal = j-SIGPLAN, volume = "49", number = "9", pages = "269--282", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628161", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "SMT-based checking of refinement types for call-by-value languages is a well-studied subject. Unfortunately, the classical translation of refinement types to verification conditions is unsound under lazy evaluation. When checking an expression, such systems implicitly assume that all the free variables in the expression are bound to values. This property is trivially guaranteed by eager, but does not hold under lazy, evaluation. Thus, to be sound and precise, a refinement type system for Haskell and the corresponding verification conditions must take into account which subset of binders actually reduces to values. We present a stratified type system that labels binders as potentially diverging or not, and that (circularly) uses refinement types to verify the labeling. We have implemented our system in LiquidHaskell and present an experimental evaluation of our approach on more than 10,000 lines of widely used Haskell libraries. We show that LiquidHaskell is able to prove 96\% of all recursive functions terminating, while requiring a modest 1.7 lines of termination-annotations per 100 lines of code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Schwerter:2014:TGE, author = "Felipe Ba{\~n}ados Schwerter and Ronald Garcia and {\'E}ric Tanter", title = "A theory of gradual effect systems", journal = j-SIGPLAN, volume = "49", number = "9", pages = "283--295", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628149", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Effect systems have the potential to help software developers, but their practical adoption has been very limited. We conjecture that this limited adoption is due in part to the difficulty of transitioning from a system where effects are implicit and unrestricted to a system with a static effect discipline, which must settle for conservative checking in order to be decidable. To address this hindrance, we develop a theory of gradual effect checking, which makes it possible to incrementally annotate and statically check effects, while still rejecting statically inconsistent programs. We extend the generic type-and-effect framework of Marino and Millstein with a notion of unknown effects, which turns out to be significantly more subtle than unknown types in traditional gradual typing. We appeal to abstract interpretation to develop and validate the concepts of gradual effect checking.
We also demonstrate how an effect system formulated in Marino and Millstein's framework can be automatically extended to support gradual checking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{McBride:2014:HKY, author = "Conor Thomas McBride", title = "How to keep your neighbours in order", journal = j-SIGPLAN, volume = "49", number = "9", pages = "297--309", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628163", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "I present a datatype-generic treatment of recursive container types whose elements are guaranteed to be stored in increasing order, with the ordering invariant rolled out systematically. Intervals, lists and binary search trees are instances of the generic treatment. On the journey to this treatment, I report a variety of failed experiments and the transferable learning experiences they triggered. I demonstrate that a total element ordering is enough to deliver insertion and flattening algorithms, and show that (with care about the formulation of the types) the implementations remain as usual. Agda's instance arguments and pattern synonyms maximize the proof search done by the typechecker and minimize the appearance of proofs in program text, often eradicating them entirely. Generalizing to indexed recursive container types, invariants such as size and balance can be expressed in addition to ordering. By way of example, I implement insertion and deletion for 2-3 trees, ensuring both order and balance by the discipline of type checking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Kaki:2014:RFH, author = "Gowtham Kaki and Suresh Jagannathan", title = "A relational framework for higher-order shape analysis", journal = j-SIGPLAN, volume = "49", number = "9", pages = "311--324", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628159", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose the integration of a relational specification framework within a dependent type system capable of verifying complex invariants over the shapes of algebraic datatypes. Our approach is based on the observation that structural properties of such datatypes can often be naturally expressed as inductively-defined relations over the recursive structure evident in their definitions. By interpreting constructor applications (abstractly) in a relational domain, we can define expressive relational abstractions for a variety of complex data structures, whose structural and shape invariants can be automatically verified. Our specification language also allows for definitions of parametric relations for polymorphic data types that enable highly composable specifications and naturally generalizes to higher-order polymorphic functions. We describe an algorithm that translates relational specifications into a decidable fragment of first-order logic that can be efficiently discharged by an SMT solver. 
We have implemented these ideas in a type checker called CATALYST that is incorporated within the MLton SML compiler. Experimental results and case studies indicate that our verification strategy is both practical and effective.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Marlow:2014:TNF, author = "Simon Marlow and Louis Brandy and Jonathan Coens and Jon Purdy", title = "There is no fork: an abstraction for efficient, concurrent, and concise data access", journal = j-SIGPLAN, volume = "49", number = "9", pages = "325--337", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628144", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a new programming idiom for concurrency, based on Applicative Functors, where concurrency is implicit in the Applicative $ < * > $ operator. The result is that concurrent programs can be written in a natural applicative style, and they retain a high degree of clarity and modularity while executing with maximal concurrency. This idiom is particularly useful for programming against external data sources, where the application code is written without the use of explicit concurrency constructs, while the implementation is able to batch together multiple requests for data from the same source, and fetch data from multiple sources concurrently. Our abstraction uses a cache to ensure that multiple requests for the same data return the same result, which frees the programmer from having to arrange to fetch data only once, which in turn leads to greater modularity. While it is generally applicable, our technique was designed with a particular application in mind: an internal service at Facebook that identifies particular types of content and takes actions based on it. Our application has a large body of business logic that fetches data from several different external sources. The framework described in this paper enables the business logic to execute efficiently by automatically fetching data concurrently; we present some preliminary results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Gibbons:2014:FDS, author = "Jeremy Gibbons and Nicolas Wu", title = "Folding domain-specific languages: deep and shallow embeddings (functional Pearl)", journal = j-SIGPLAN, volume = "49", number = "9", pages = "339--347", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628138", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A domain-specific language can be implemented by embedding within a general-purpose host language. This embedding may be deep or shallow, depending on whether terms in the language construct syntactic or semantic representations. 
The deep and shallow styles are closely related, and intimately connected to folds; in this paper, we explore that connection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Fredriksson:2014:KNS, author = "Olle Fredriksson and Dan R. Ghica", title = "{Krivine} nets: a semantic foundation for distributed execution", journal = j-SIGPLAN, volume = "49", number = "9", pages = "349--361", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628152", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We define a new approach to compilation to distributed architectures based on networks of abstract machines. Using it we can implement a generalised and fully transparent form of Remote Procedure Call that supports calling higher-order functions across node boundaries, without sending actual code. Our starting point is the classic Krivine machine, which implements reduction for untyped call-by-name PCF. We successively add the features that we need for distributed execution and show the correctness of each addition. Then we construct a two-level operational semantics, where the high level is a network of communicating machines, and the low level is given by local machine transitions. Using these networks, we arrive at our final system, the Krivine Net. We show that Krivine Nets give a correct distributed implementation of the Krivine machine, which preserves both termination and non-termination properties. All the technical results have been formalised and proved correct in Agda. We also implement a prototype compiler which we compare with previous distributing compilers based on Girard's Geometry of Interaction and on Game Semantics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Accattoli:2014:DAM, author = "Beniamino Accattoli and Pablo Barenbaum and Damiano Mazza", title = "Distilling abstract machines", journal = j-SIGPLAN, volume = "49", number = "9", pages = "363--376", month = sep, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692915.2628154", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is well-known that many environment-based abstract machines can be seen as strategies in lambda calculi with explicit substitutions (ES). Recently, graphical syntaxes and linear logic led to the linear substitution calculus (LSC), a new approach to ES that is halfway between small-step calculi and traditional calculi with ES. This paper studies the relationship between the LSC and environment-based abstract machines. While traditional calculi with ES simulate abstract machines, the LSC rather distills them: some transitions are simulated while others vanish, as they map to a notion of structural congruence. The distillation process unveils that abstract machines in fact implement weak linear head reduction, a notion of evaluation having a central role in the theory of linear logic. 
We show that such a pattern applies uniformly in call-by-name, call-by-value, and call-by-need, catching many machines in the literature. We start by distilling the KAM, the CEK, and a sketch of the ZINC, and then provide simplified versions of the SECD, the lazy KAM, and Sestoft's machine. Along the way we also introduce some new machines with global environments. Moreover, we show that distillation preserves the time complexity of the executions, i.e. the LSC is a complexity-preserving abstraction of abstract machines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '14 conference proceedings.", } @Article{Feldthaus:2014:CCT, author = "Asger Feldthaus and Anders M{\o}ller", title = "Checking correctness of {TypeScript} interfaces for {JavaScript} libraries", journal = j-SIGPLAN, volume = "49", number = "10", pages = "1--16", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660215", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The TypeScript programming language adds optional types to JavaScript, with support for interaction with existing JavaScript libraries via interface declarations. Such declarations have been written for hundreds of libraries, but they can be difficult to write and often contain errors, which may affect the type checking and misguide code completion for the application code in IDEs. We present a pragmatic approach to check correctness of TypeScript declaration files with respect to JavaScript library implementations. The key idea in our algorithm is that many declaration errors can be detected by an analysis of the library initialization state combined with a light-weight static analysis of the library function code. Our experimental results demonstrate the effectiveness of the approach: it has found 142 errors in the declaration files of 10 libraries, with an analysis time of a few minutes per library and with a low number of false positives. Our analysis of how programmers use library interface declarations furthermore reveals some practical limitations of the TypeScript type system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Andreasen:2014:DSA, author = "Esben Andreasen and Anders M{\o}ller", title = "Determinacy in static analysis for {jQuery}", journal = j-SIGPLAN, volume = "49", number = "10", pages = "17--31", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660214", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Static analysis for JavaScript can potentially help programmers find errors early during development. Although much progress has been made on analysis techniques, a major obstacle is the prevalence of libraries, in particular jQuery, which apply programming patterns that have detrimental consequences on the analysis precision and performance.
Previous work on dynamic determinacy analysis has demonstrated how information about program expressions that always resolve to a fixed value in some call context may lead to significant scalability improvements of static analysis for such code. We present a static dataflow analysis for JavaScript that infers and exploits determinacy information on-the-fly, to enable analysis of some of the most complex parts of jQuery. The analysis combines selective context and path sensitivity, constant propagation, and branch pruning, based on a systematic investigation of the main causes of analysis imprecision when using a more basic analysis. The techniques are implemented in the TAJS analysis tool and evaluated on a collection of small programs that use jQuery. Our results show that the proposed analysis techniques boost both precision and performance, specifically for inferring type information and call graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Pradel:2014:EAR, author = "Michael Pradel and Parker Schuh and George Necula and Koushik Sen", title = "{EventBreak}: analyzing the responsiveness of user interfaces through performance-guided test generation", journal = j-SIGPLAN, volume = "49", number = "10", pages = "33--47", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660233", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Event-driven user interface applications typically have a single thread of execution that processes event handlers in response to input events triggered by the user, the network, or other applications. Programmers must ensure that event handlers terminate after a short amount of time because otherwise, the application may become unresponsive. This paper presents EventBreak, a performance-guided test generation technique to identify and analyze event handlers whose execution time may gradually increase while using the application. The key idea is to systematically search for pairs of events where triggering one event increases the execution time of the other event. For example, this situation may happen because one event accumulates data that is processed by the other event. We implement the approach for JavaScript-based web applications and apply it to three real-world applications. EventBreak discovers events with an execution time that gradually increases in an unbounded way, which makes the application unresponsive, and events that, if triggered repeatedly, reveal a severe scalability problem, which makes the application unusable. The approach reveals two known bugs and four previously unknown responsiveness problems. 
Furthermore, we show that EventBreak helps in testing that event handlers avoid such problems by bounding a handler's execution time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Hsiao:2014:UWC, author = "Chun-Hung Hsiao and Michael Cafarella and Satish Narayanasamy", title = "Using web corpus statistics for program analysis", journal = j-SIGPLAN, volume = "49", number = "10", pages = "49--65", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660226", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Several program analysis tools --- such as plagiarism detection and bug finding --- rely on knowing a piece of code's relative semantic importance. For example, a plagiarism detector should not bother reporting two programs that have an identical simple loop counter test, but should report programs that share more distinctive code. Traditional program analysis techniques (e.g., finding data and control dependencies) are useful, but do not say how surprising or common a line of code is. Natural language processing researchers have encountered a similar problem and addressed it using an n-gram model of text frequency, derived from statistics computed over text corpora. We propose and compute an n-gram model for programming languages, computed over a corpus of 2.8 million JavaScript programs we downloaded from the Web. In contrast to previous techniques, we describe a code n-gram as a subgraph of the program dependence graph that contains all nodes and edges reachable in n steps from the statement. We can count n-grams in a program and count the frequency of n-grams in the corpus, enabling us to compute tf-idf-style measures that capture the differing importance of different lines of code. We demonstrate the power of this approach by implementing a plagiarism detector with accuracy that beats previous techniques, and a bug-finding tool that discovered over a dozen previously unknown bugs in a collection of real deployed programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Barr:2014:TAT, author = "Earl T. Barr and Mark Marron", title = "{Tardis}: affordable time-travel debugging in managed runtimes", journal = j-SIGPLAN, volume = "49", number = "10", pages = "67--82", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660209", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Developers who set a breakpoint a few statements too late or who are trying to diagnose a subtle bug from a single core dump often wish for a time-traveling debugger. The ability to rewind time to see the exact sequence of statements and program values leading to an error has great intuitive appeal but, due to large time and space overheads, time traveling debuggers have seen limited adoption.
A managed runtime, such as the Java JVM or a JavaScript engine, has already paid much of the cost of providing core features --- type safety, memory management, and virtual IO --- that can be reused to implement a low overhead time-traveling debugger. We leverage this insight to design and build affordable time-traveling debuggers for managed languages. Tardis realizes our design: it provides affordable time-travel with an average overhead of only 7\% during normal execution, a rate of 0.6MB/s of history logging, and a worst-case 0.68s time-travel latency on our benchmark applications. Tardis can also debug optimized code using time-travel to reconstruct state. This capability, coupled with its low overhead, makes Tardis suitable for use as the default debugger for managed languages, promising to bring time-traveling debugging into the mainstream and transform the practice of debugging.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Bell:2014:PID, author = "Jonathan Bell and Gail Kaiser", title = "{Phosphor}: illuminating dynamic data flow in commodity {JVMs}", journal = j-SIGPLAN, volume = "49", number = "10", pages = "83--101", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660212", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Dynamic taint analysis is a well-known information flow analysis problem with many possible applications. Taint tracking allows for analysis of application data flow by assigning labels to data, and then propagating those labels through data flow. Taint tracking systems traditionally compromise among performance, precision, soundness, and portability. Performance can be critical, as these systems are often intended to be deployed to production environments, and hence must have low overhead. To be deployed in security-conscious settings, taint tracking must also be sound and precise. Dynamic taint tracking must be portable in order to be easily deployed and adopted for real world purposes, without requiring recompilation of the operating system or language interpreter, and without requiring access to application source code. We present Phosphor, a dynamic taint tracking system for the Java Virtual Machine (JVM) that simultaneously achieves our goals of performance, soundness, precision, and portability. Moreover, to our knowledge, it is the first portable general purpose taint tracking system for the JVM. We evaluated Phosphor's performance on two commonly used JVM languages (Java and Scala), on two successive revisions of two commonly used JVMs (Oracle's HotSpot and OpenJDK's IcedTea) and on Android's Dalvik Virtual Machine, finding its performance to be impressive: as low as 3\% (53\% on average; 220\% at worst) using the DaCapo macro benchmark suite.
This paper describes our approach toward achieving portable taint tracking in the JVM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Pina:2014:RDJ, author = "Lu{\'\i}s Pina and Lu{\'\i}s Veiga and Michael Hicks", title = "{Rubah}: {DSU} for {Java} on a stock {JVM}", journal = j-SIGPLAN, volume = "49", number = "10", pages = "103--119", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660220", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents Rubah, the first dynamic software updating system for Java that: is portable, implemented via libraries and bytecode rewriting on top of a standard JVM; is efficient, imposing essentially no overhead on normal, steady-state execution; is flexible, allowing nearly arbitrary changes to classes between updates; and is non-disruptive, employing either a novel eager algorithm that transforms the program state with multiple threads, or a novel lazy algorithm that transforms objects as they are demanded, post-update. Requiring little programmer effort, Rubah has been used to dynamically update five long-running applications: the H2 database, the Voldemort key-value store, the Jake2 implementation of the Quake 2 shooter game, the CrossFTP server, and the JavaEmailServer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Shahriyar:2014:FCG, author = "Rifat Shahriyar and Stephen M. Blackburn and Kathryn S. McKinley", title = "Fast conservative garbage collection", journal = j-SIGPLAN, volume = "49", number = "10", pages = "121--139", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660198", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Garbage collectors are exact or conservative. An exact collector identifies all references precisely and may move referents and update references, whereas a conservative collector treats one or more of stack, register, and heap references as ambiguous. Ambiguous references constrain collectors in two ways. (1) Since they may be pointers, the collectors must retain referents. (2) Since they may be values, the collectors cannot modify them, pinning their referents. We explore conservative collectors for managed languages, with ambiguous stacks and registers. We show that for Java benchmarks they retain and pin remarkably few heap objects: $ < 0.01 \% $ are falsely retained and 0.03\% are pinned. The larger effect is collector design. Prior conservative collectors (1) use mark-sweep and unnecessarily forgo moving all objects, or (2) use mostly copying and pin entire pages. Compared to generational collection, overheads are substantial: 12\% and 45\% respectively. We introduce high performance conservative Immix and reference counting (RC). Immix is a mark-region collector with fine line-grain pinning and opportunistic copying of unambiguous referents.
Deferred RC simply needs an object map to deliver the first conservative RC. We implement six exact collectors and their conservative counterparts. Conservative Immix and RC come within 2 to 3\% of their exact counterparts. In particular, conservative RC Immix is slightly faster than a well-tuned exact generational collector. These findings show that for managed languages, conservative collection is compatible with high performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Holk:2014:RBM, author = "Eric Holk and Ryan Newton and Jeremy Siek and Andrew Lumsdaine", title = "Region-based memory management for {GPU} programming languages: enabling rich data structures on a spartan host", journal = j-SIGPLAN, volume = "49", number = "10", pages = "141--155", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660244", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graphics processing units (GPUs) can effectively accelerate many applications, but their applicability has been largely limited to problems whose solutions can be expressed neatly in terms of linear algebra. Indeed, most GPU programming languages limit the user to simple data structures --- typically only multidimensional rectangular arrays of scalar values. Many algorithms are more naturally expressed using higher level language features, such as algebraic data types (ADTs) and first class procedures, yet building these structures in a manner suitable for a GPU remains a challenge. We present a region-based memory management approach that enables rich data structures in Harlan, a language for data parallel computing. Regions enable rich data structures by providing a uniform representation for pointers on both the CPU and GPU and by providing a means of transferring entire data structures between CPU and GPU memory. We demonstrate Harlan's increased expressiveness on several example programs and show that Harlan performs well on more traditional data-parallel problems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Uhler:2014:SSB, author = "Richard Uhler and Nirav Dave", title = "{Smten} with satisfiability-based search", journal = j-SIGPLAN, volume = "49", number = "10", pages = "157--176", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660208", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Satisfiability (SAT) and Satisfiability Modulo Theories (SMT) have been used in solving a wide variety of important and challenging problems, including automatic test generation, model checking, and program synthesis. For these applications to scale to larger problem instances, developers cannot rely solely on the sophistication of SAT and SMT solvers to efficiently solve their queries; they must also optimize their own orchestration and construction of queries. We present Smten, a high-level language for orchestrating and constructing satisfiability-based search queries. 
We show that applications developed using Smten require significantly fewer lines of code and less developer effort to achieve results comparable to standard SMT-based tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Bosboom:2014:SCC, author = "Jeffrey Bosboom and Sumanaruban Rajadurai and Weng-Fai Wong and Saman Amarasinghe", title = "{StreamJIT}: a commensal compiler for high-performance stream programming", journal = j-SIGPLAN, volume = "49", number = "10", pages = "177--195", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660236", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There are many domain libraries, but despite the performance benefits of compilation, domain-specific languages are comparatively rare due to the high cost of implementing an optimizing compiler. We propose commensal compilation, a new strategy for compiling embedded domain-specific languages by reusing the massive investment in modern language virtual machine platforms. Commensal compilers use the host language's front-end, use host platform APIs that enable back-end optimizations by the host platform JIT, and use an autotuner for optimization selection. The cost of implementing a commensal compiler is only the cost of implementing the domain-specific optimizations. We demonstrate the concept by implementing a commensal compiler for the stream programming language StreamJIT atop the Java platform. Our compiler achieves performance 2.8 times better than the StreamIt native code (via GCC) compiler with considerably less implementation effort.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Tosch:2014:SPA, author = "Emma Tosch and Emery D. Berger", title = "{SurveyMan}: programming and automatically debugging surveys", journal = j-SIGPLAN, volume = "49", number = "10", pages = "197--211", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660206", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Surveys can be viewed as programs, complete with logic, control flow, and bugs. Word choice or the order in which questions are asked can unintentionally bias responses. Vague, confusing, or intrusive questions can cause respondents to abandon a survey. Surveys can also have runtime errors: inattentive respondents can taint results. This effect is especially problematic when deploying surveys in uncontrolled settings, such as on the web or via crowdsourcing platforms. Because the results of surveys drive business decisions and inform scientific conclusions, it is crucial to make sure they are correct. We present SurveyMan, a system for designing, deploying, and automatically debugging surveys. Survey authors write their surveys in a lightweight domain-specific language aimed at end users. 
SurveyMan statically analyzes the survey to provide feedback to survey authors before deployment. It then compiles the survey into JavaScript and deploys it either to the web or a crowdsourcing platform. SurveyMan's dynamic analyses automatically find survey bugs, and control for the quality of responses. We evaluate SurveyMan's algorithms analytically and empirically, demonstrating its effectiveness with case studies of social science surveys conducted via Amazon's Mechanical Turk.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Bartenstein:2014:RTS, author = "Thomas W. Bartenstein and Yu David Liu", title = "Rate types for stream programs", journal = j-SIGPLAN, volume = "49", number = "10", pages = "213--232", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660225", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce RATE TYPES, a novel type system to reason about and optimize data-intensive programs. Built around stream languages, RATE TYPES performs static quantitative reasoning about stream rates --- the frequency of data items in a stream being consumed, processed, and produced. Despite the fact that streams are fundamentally dynamic, we find two essential concepts of stream rate control --- throughput ratio and natural rate --- are intimately related to the program structure itself and can be effectively reasoned about by a type system. RATE TYPES is proven to correspond with a time-aware and parallelism-aware operational semantics. The strong correspondence result tolerates arbitrary schedules, and does not require any synchronization between stream filters. We further implement RATE TYPES, demonstrating its effectiveness in predicting stream data rates in real-world stream programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Amin:2014:FPD, author = "Nada Amin and Tiark Rompf and Martin Odersky", title = "Foundations of path-dependent types", journal = j-SIGPLAN, volume = "49", number = "10", pages = "233--249", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660216", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A scalable programming language is one in which the same concepts can describe small as well as large parts. Towards this goal, Scala unifies concepts from object and module systems. An essential ingredient of this unification is the concept of objects with type members, which can be referenced through path-dependent types. Unfortunately, path-dependent types are not well-understood, and have been a roadblock in grounding the Scala type system on firm theory. We study several calculi for path-dependent types. We present DOT which captures the essence --- DOT stands for Dependent Object Types. We explore the design space bottom-up, teasing apart inherent from accidental complexities, while fully mechanizing our models at each step.
Even in this simple setting, many interesting patterns arise from the interaction of structural and nominal features. Whereas our simple calculus enjoys many desirable and intuitive properties, we demonstrate that the theory gets much more complicated once we add another Scala feature, type refinement, or extend the subtyping relation to a lattice. We discuss possible remedies and trade-offs in modeling type systems for Scala-like languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Allende:2014:CGT, author = "Esteban Allende and Johan Fabry and Ronald Garcia and {\'E}ric Tanter", title = "Confined gradual typing", journal = j-SIGPLAN, volume = "49", number = "10", pages = "251--270", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660222", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Gradual typing combines static and dynamic typing flexibly and safely in a single programming language. To do so, gradually typed languages implicitly insert casts where needed, to ensure at runtime that typing assumptions are not violated by untyped code. However, the implicit nature of cast insertion, especially on higher-order values, can jeopardize reliability and efficiency: higher-order casts can fail at any time, and are costly to execute. We propose Confined Gradual Typing, which extends gradual typing with two new type qualifiers that let programmers control the flow of values between the typed and the untyped worlds, and thereby trade some flexibility for more reliability and performance. We formally develop two variants of Confined Gradual Typing that capture different flexibility/guarantee tradeoffs. We report on the implementation of Confined Gradual Typing in Gradualtalk, a gradually-typed Smalltalk, which confirms the performance advantage of avoiding unwanted higher-order casts and the low overhead of the approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Altidor:2014:RJG, author = "John Altidor and Yannis Smaragdakis", title = "Refactoring {Java} generics by inferring wildcards, in practice", journal = j-SIGPLAN, volume = "49", number = "10", pages = "271--290", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660203", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Wildcard annotations can improve the generality of Java generic libraries, but require heavy manual effort. We present an algorithm for refactoring and inferring more general type instantiations of Java generics using wildcards. Compared to past approaches, our work is practical and immediately applicable: we assume no changes to the Java type system, while taking into account all its intricacies. Our system allows users to select declarations (variables, method parameters, return types, etc.) to generalize and considers declarations not declared in available source code. 
It then performs an inter-procedural flow analysis and a method body analysis, in order to generalize type signatures. We evaluate our technique on six Java generic libraries. We find that 34\% of available declarations of variant type signatures can be generalized --- i.e., relaxed with more general wildcard types. On average, 146 other declarations need to be updated when a declaration is generalized, showing that this refactoring would be too tedious and error-prone to perform manually.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{David:2014:CMC, author = "Florian David and Gael Thomas and Julia Lawall and Gilles Muller", title = "Continuously measuring critical section pressure with the free-lunch profiler", journal = j-SIGPLAN, volume = "49", number = "10", pages = "291--307", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660210", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today, Java is regularly used to implement large multi-threaded server-class applications that use locks to protect access to shared data. However, understanding the impact of locks on the performance of a system is complex, and thus the use of locks can impede the progress of threads on configurations that were not anticipated by the developer, during specific phases of the execution. In this paper, we propose Free Lunch, a new lock profiler for Java application servers, specifically designed to identify, in-vivo, phases where the progress of the threads is impeded by a lock. Free Lunch is designed around a new metric, critical section pressure (CSP), which directly correlates the progress of the threads to each of the locks. Using Free Lunch, we have identified phases of high CSP, which were hidden with other lock profilers, in the distributed Cassandra NoSQL database and in several applications from the DaCapo 9.12, the SPECjvm2008 and the SPECjbb2005 benchmark suites. Our evaluation of Free Lunch shows that its overhead is never greater than 6\%, making it suitable for in-vivo use.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Misailovic:2014:CRA, author = "Sasa Misailovic and Michael Carbin and Sara Achour and Zichao Qi and Martin C. Rinard", title = "{Chisel}: reliability- and accuracy-aware optimization of approximate computational kernels", journal = j-SIGPLAN, volume = "49", number = "10", pages = "309--328", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660231", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The accuracy of an approximate computation is the distance between the result that the computation produces and the corresponding fully accurate result. The reliability of the computation is the probability that it will produce an acceptably accurate result. 
Emerging approximate hardware platforms provide approximate operations that, in return for reduced energy consumption and/or increased performance, exhibit reduced reliability and/or accuracy. We present Chisel, a system for reliability- and accuracy-aware optimization of approximate computational kernels that run on approximate hardware platforms. Given a combined reliability and/or accuracy specification, Chisel automatically selects approximate kernel operations to synthesize an approximate computation that minimizes energy consumption while satisfying its reliability and accuracy specification. We evaluate Chisel on five applications from the image processing, scientific computing, and financial analysis domains. The experimental results show that our implemented optimization algorithm enables Chisel to optimize our set of benchmark kernels to obtain energy savings from 8.7\% to 19.8\% compared to the fully reliable kernel implementations while preserving important reliability guarantees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Kambadur:2014:ESE, author = "Melanie Kambadur and Martha A. Kim", title = "An experimental survey of energy management across the stack", journal = j-SIGPLAN, volume = "49", number = "10", pages = "329--344", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660196", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern demand for energy-efficient computation has spurred research at all levels of the stack, from devices to microarchitecture, operating systems, compilers, and languages. Unfortunately, this breadth has resulted in a disjointed space, with technologies at different levels of the system stack rarely compared, let alone coordinated. This work begins to remedy the problem, conducting an experimental survey of the present state of energy management across the stack. Focusing on settings that are exposed to software, we measure the total energy, average power, and execution time of 41 benchmark applications in 220 configurations, across a total of 200,000 program executions. 
Some of the more important findings of the survey include that effective parallelization and compiler optimizations have the potential to save far more energy than Linux's frequency tuning algorithms; that certain non-complementary energy strategies can undercut each other's savings by half when combined; and that while the power impacts of most strategies remain constant across applications, the runtime impacts vary, resulting in inconsistent energy impacts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Pinto:2014:UEB, author = "Gustavo Pinto and Fernando Castor and Yu David Liu", title = "Understanding energy behaviors of thread management constructs", journal = j-SIGPLAN, volume = "49", number = "10", pages = "345--360", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660235", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Java programmers are faced with numerous choices in managing concurrent execution on multicore platforms. These choices often have different trade-offs (e.g., performance, scalability, and correctness guarantees). This paper analyzes an additional dimension, energy consumption. It presents an empirical study aiming to illuminate the relationship between the choices and settings of thread management constructs and energy consumption. We consider three important thread management constructs in concurrent programming: explicit thread creation, fixed-size thread pooling, and work stealing. We further shed light on the energy/performance trade-off of three ``tuning knobs'' of these constructs: the number of threads, the task division strategy, and the characteristics of processed data. Through an extensive experimental space exploration over real-world Java programs, we produce a list of findings about the energy behaviors of concurrent programs, which are not always obvious. The study serves as a first step toward improving energy efficiency of concurrent programs on parallel architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Drechsler:2014:DRU, author = "Joscha Drechsler and Guido Salvaneschi and Ragnar Mogk and Mira Mezini", title = "Distributed {REScala}: an update algorithm for distributed reactive programming", journal = j-SIGPLAN, volume = "49", number = "10", pages = "361--376", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660240", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reactive programming improves the design of reactive applications by relocating the logic for managing dependencies between dependent values away from the application logic to the language implementation. Many distributed applications are reactive. Yet, existing change propagation algorithms are not suitable in a distributed setting. 
We propose Distributed REScala, a reactive language with a change propagation algorithm that works without centralized knowledge about the topology of the dependency structure among reactive values and avoids unnecessary propagation of changes, while retaining safety guarantees (glitch freedom). Distributed REScala enables distributed reactive programming, bringing the benefits of reactive programming to distributed applications. We demonstrate the enabled design improvements by a case study. We also empirically evaluate the performance of our algorithm in comparison to other algorithms in a simulated distributed setting.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Rendel:2014:OAA, author = "Tillmann Rendel and Jonathan Immanuel Brachth{\"a}user and Klaus Ostermann", title = "From object algebras to attribute grammars", journal = j-SIGPLAN, volume = "49", number = "10", pages = "377--395", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660237", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Oliveira and Cook (2012) and Oliveira et al. (2013) have recently introduced object algebras as a program structuring technique to improve the modularity and extensibility of programs. We analyze the relationship between object algebras and attribute grammars (AGs), a formalism to augment context-free grammars with attributes. We present an extension of the object algebra technique with which the full class of L-attributed grammars --- an important class of AGs that corresponds to one-pass compilers --- can be encoded in Scala. The encoding is modular (attributes can be defined and type-checked separately), scalable (the size of the encoding is linear in the size of the AG specification) and compositional (each AG artifact is represented as a semantic object of the host language). To evaluate these claims, we have formalized the encoding and re-implemented a one-pass compiler for a subset of C with our technique. We also discuss how advanced features of modern AG systems, such as higher-order and parameterized attributes, reference attributes, and forwarding can be supported.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Ureche:2014:LDL, author = "Vlad Ureche and Eugene Burmako and Martin Odersky", title = "Late data layout: unifying data representation transformations", journal = j-SIGPLAN, volume = "49", number = "10", pages = "397--416", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660197", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Values need to be represented differently when interacting with certain language features. For example, an integer has to take an object-based representation when interacting with erased generics, although, for performance reasons, the stack-based value representation is better.
To abstract over these implementation details, some programming languages choose to expose a unified high-level concept (the integer) and let the compiler choose its exact representation and insert coercions where necessary. This pattern appears in multiple language features such as value classes, specialization and multi-stage programming: they all expose a unified concept which they later refine into multiple representations. Yet, the underlying compiler implementations typically entangle the core mechanism with assumptions about the alternative representations and their interaction with other language features. In this paper we present the Late Data Layout mechanism, a simple but versatile type-driven generalization that subsumes and improves the state-of-the-art representation transformations. In doing so, we make two key observations: (1) annotated types conveniently capture the semantics of using multiple representations and (2) local type inference can be used to consistently and optimally introduce coercions. We validated our approach by implementing three language features as Scala compiler extensions: value classes, specialization (using the miniboxing representation) and a simplified multi-stage programming mechanism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Mitschke:2014:ILI, author = "Ralf Mitschke and Sebastian Erdweg and Mirko K{\"o}hler and Mira Mezini and Guido Salvaneschi", title = "{i3QL}: language-integrated live data views", journal = j-SIGPLAN, volume = "49", number = "10", pages = "417--432", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660242", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "An incremental computation updates its result based on a change to its input, which is often an order of magnitude faster than a recomputation from scratch. In particular, incrementalization can make expensive computations feasible for settings that require short feedback cycles, such as interactive systems, IDEs, or (soft) real-time systems. This paper presents i3QL, a general-purpose programming language for specifying incremental computations. i3QL provides a declarative SQL-like syntax and is based on incremental versions of operators from relational algebra, enriched with support for general recursion. We integrated i3QL into Scala as a library, which enables programmers to use regular Scala code for non-incremental subcomputations of an i3QL query and to easily integrate incremental computations into larger software projects. To improve performance, i3QL optimizes user-defined queries by applying algebraic laws and partial evaluation. We describe the design and implementation of i3QL and its optimizations, demonstrate its applicability, and evaluate its performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Chakrabarti:2014:ALL, author = "Dhruva R. Chakrabarti and Hans-J. 
Boehm and Kumud Bhandari", title = "{Atlas}: leveraging locks for non-volatile memory consistency", journal = j-SIGPLAN, volume = "49", number = "10", pages = "433--452", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660224", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Non-volatile main memory, such as memristors or phase change memory, can revolutionize the way programs persist data. In-memory objects can themselves be persistent without the need for a separate persistent data storage format. However, the challenge is to ensure that such data remains consistent if a failure occurs during execution. In this paper, we present our system, called Atlas, which adds durability semantics to lock-based code, typically allowing us to automatically maintain a globally consistent state even in the presence of failures. We identify failure-atomic sections of code based on existing critical sections and describe a log-based implementation that can be used to recover a consistent state after a failure. We discuss several subtle semantic issues and implementation tradeoffs. We confirm the ability to rapidly flush CPU caches as a core implementation bottleneck and suggest partial solutions. Experimental results confirm the practicality of our approach and provide insight into the overheads of such a system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Steele:2014:FSP, author = "Guy L. {Steele, Jr.} and Doug Lea and Christine H. Flood", title = "Fast splittable pseudorandom number generators", journal = j-SIGPLAN, volume = "49", number = "10", pages = "453--472", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660195", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/m/marsaglia-george.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/jstatsoft.bib; https://www.math.utah.edu/pub/tex/bib/mathcw.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/tomacs.bib", abstract = "We describe a new algorithm SplitMix for an object-oriented and splittable pseudorandom number generator (PRNG) that is quite fast: 9 64-bit arithmetic/logical operations per 64 bits generated. A conventional linear PRNG object provides a generate method that returns one pseudorandom value and updates the state of the PRNG, but a splittable PRNG object also has a second operation, split, that replaces the original PRNG object with two (seemingly) independent PRNG objects, by creating and returning a new such object and updating the state of the original object. Splittable PRNG objects make it easy to organize the use of pseudorandom numbers in multithreaded programs structured using fork-join parallelism. No locking or synchronization is required (other than the usual memory fence immediately after object creation). Because the generate method has no loops or conditionals, it is suitable for SIMD or GPU implementation. 
We derive SplitMix from the DotMix algorithm of Leiserson, Schardl, and Sukha by making a series of program transformations and engineering improvements. The end result is an object-oriented version of the purely functional API used in the Haskell library for over a decade, but SplitMix is faster and produces pseudorandom sequences of higher quality; it is also far superior in quality and speed to java.util.Random, and has been included in Java JDK8 as the class java.util.SplittableRandom. We have tested the pseudorandom sequences produced by SplitMix using two standard statistical test suites (DieHarder and TestU01) and they appear to be adequate for ``everyday'' use, such as in Monte Carlo algorithms and randomized data structures where speed is important.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark-1 = "OOPSLA '14 conference proceedings.", remark-2 = "On page 466, the authors describe an interesting technique for improving a user-supplied seed that might produce insufficient randomness in the next several members of the random-number sequence: ``Long runs of 0-bits or of 1-bits in the $\gamma$ [candidate seed] value do not cause bits of the seed to flip; an approximate proxy for how many bits of the seed will flip might be the number of bit pairs of the form 01 or 10 in the candidate $\gamma$ value {\tt z}. Therefore we require that the number of such pairs, as computed by {\tt Long.bitCount(z ^ (z >>> 1))}, exceed 24; if it does not, then the candidate z is replaced by the XOR of {\tt z} and {\tt 0xaaaaaaaaaaaaaaaaL}, a constant chosen so that (a) the low bit of {\tt z} remains 1, and (b) every bit pair of the form 00 or 11 becomes either 01 or 10, and likewise every bit pair of the form 01 or 10 becomes either 00 or 11, so the new value necessarily has more than 24 bit pairs whose bits differ. Testing shows that this trick appears to be effective.''", remark-3 = "From page 468: ``we did three runs of TestU01 BigCrush on {\tt java.util.Random}; 19 tests produced clear failure on all three runs. These included 9 Birthday Spacings tests, 8 ClosePairs tests, a WeightDistrib test, and a CouponCollector test. This confirms L'Ecuyer's observation that {\tt java.util.Random} tends to fail Birthday Spacings tests [17].'' The reference is to \cite{LEcuyer:2001:SUR}.", remark-4 = "From page 470: ``[L'Ecuyer] comments, `In the Java class {\tt java.util.Random}, RNG streams can be declared and constructed dynamically, without limit on their number. 
However, no precaution seems to have been taken regarding the independence of these streams.'''", remark-5 = "From page 471: ``They [the generators in this paper] should not be used for cryptographic or security applications, because they are too predictable (the mixing functions are easily inverted, and two successive outputs suffice to reconstruct the internal state), \ldots{} One version seems especially suitable for use as a replacement for {\tt java.util.Random}, because it produces sequences of higher quality, is faster in sequential use, is easily parallelized for use in JDK8 stream expressions, and is amenable to efficient implementation on SIMD and GPU architectures.''", } @Article{Samak:2014:MTS, author = "Malavika Samak and Murali Krishna Ramanathan", title = "Multithreaded test synthesis for deadlock detection", journal = j-SIGPLAN, volume = "49", number = "10", pages = "473--489", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660238", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Designing and implementing thread-safe multithreaded libraries can be a daunting task as developers of these libraries need to ensure that their implementations are free from concurrency bugs, including deadlocks. The usual practice involves employing software testing and/or dynamic analysis to detect deadlocks. Their effectiveness is dependent on well-designed multithreaded test cases. Unsurprisingly, developing multithreaded tests is significantly harder than developing sequential tests for obvious reasons. In this paper, we address the problem of automatically synthesizing multithreaded tests that can induce deadlocks. The key insight to our approach is that a subset of the properties observed when a deadlock manifests in a concurrent execution can also be observed in a single threaded execution. We design a novel, automatic, scalable and directed approach that identifies these properties and synthesizes a deadlock revealing multithreaded test. The input to our approach is the library implementation under consideration and the output is a set of deadlock revealing multithreaded tests. We have implemented our approach as part of a tool, named OMEN$^1$. OMEN is able to synthesize multithreaded tests on many multithreaded Java libraries. Applying a dynamic deadlock detector on the execution of the synthesized tests results in the detection of a number of deadlocks, including 35 real deadlocks in classes documented as thread-safe. 
Moreover, our experimental results show that dynamic analysis on multithreaded tests that are either synthesized randomly or developed by third-party programmers are ineffective in detecting the deadlocks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Bergan:2014:SEM, author = "Tom Bergan and Dan Grossman and Luis Ceze", title = "Symbolic execution of multithreaded programs from arbitrary program contexts", journal = j-SIGPLAN, volume = "49", number = "10", pages = "491--506", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660200", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe an algorithm to perform symbolic execution of a multithreaded program starting from an arbitrary program context. We argue that this can enable more efficient symbolic exploration of deep code paths in multithreaded programs by allowing the symbolic engine to jump directly to program contexts of interest. The key challenge is modeling the initial context with reasonable precision --- an overly approximate model leads to exploration of many infeasible paths during symbolic execution, while a very precise model would be so expensive to compute that computing it would defeat the purpose of jumping directly to the initial context in the first place. We propose a context-specific dataflow analysis that approximates the initial context cheaply, but precisely enough to avoid some common causes of infeasible-path explosion. This model is necessarily approximate --- it may leave portions of the memory state unconstrained, leaving our symbolic execution unable to answer simple questions such as ``which thread holds lock A?''. For such cases, we describe a novel algorithm for evaluating symbolic synchronization during symbolic execution. Our symbolic execution semantics are sound and complete up to the limits of the underlying SMT solver. We describe initial experiments on an implementation in Cloud 9.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Barowy:2014:CDD, author = "Daniel W. Barowy and Dimitar Gochev and Emery D. Berger", title = "{CheckCell}: data debugging for spreadsheets", journal = j-SIGPLAN, volume = "49", number = "10", pages = "507--523", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660207", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Testing and static analysis can help root out bugs in programs, but not in data. This paper introduces data debugging, an approach that combines program analysis and statistical analysis to automatically find potential data errors. Since it is impossible to know a priori whether data are erroneous, data debugging instead locates data that has a disproportionate impact on the computation. Such data is either very important, or wrong. 
Data debugging is especially useful in the context of data-intensive programming environments that intertwine data with programs in the form of queries or formulas. We present the first data debugging tool, CheckCell, an add-in for Microsoft Excel. CheckCell identifies cells that have an unusually high impact on the spreadsheet's computations. We show that CheckCell is both analytically and empirically fast and effective. We show that it successfully finds injected typographical errors produced by a generative model trained with data entry from 169,112 Mechanical Turk tasks. CheckCell is more precise and efficient than standard outlier detection techniques. CheckCell also automatically identifies a key flaw in the infamous Reinhart and Rogoff spreadsheet.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Pavlinovic:2014:FMT, author = "Zvonimir Pavlinovic and Tim King and Thomas Wies", title = "Finding minimum type error sources", journal = j-SIGPLAN, volume = "49", number = "10", pages = "525--542", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660230", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Automatic type inference is a popular feature of functional programming languages. If a program cannot be typed, the compiler typically reports a single program location in its error message. This location is the point where the type inference failed, but not necessarily the actual source of the error. Other potential error sources are not even considered. Hence, the compiler often misses the true error source, which increases debugging time for the programmer. In this paper, we present a general framework for automatic localization of type errors. Our algorithm finds all minimum error sources, where the exact definition of minimum is given in terms of a compiler-specific ranking criterion. Compilers can use minimum error sources to produce more meaningful error reports, and for automatic error correction. Our approach works by reducing the search for minimum error sources to an optimization problem that we formulate in terms of weighted maximum satisfiability modulo theories (MaxSMT). The reduction to weighted MaxSMT allows us to build on SMT solvers to support rich type systems and at the same time abstract from the concrete criterion that is used for ranking the error sources. We have implemented an instance of our framework targeted at Hindley-Milner type systems and evaluated it on existing OCaml benchmarks for type error localization. 
Our evaluation shows that our approach has the potential to significantly improve the quality of type error reports produced by state of the art compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Liu:2014:FFL, author = "Peng Liu and Omer Tripp and Xiangyu Zhang", title = "{Flint}: fixing linearizability violations", journal = j-SIGPLAN, volume = "49", number = "10", pages = "543--560", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660217", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Writing concurrent software while achieving both correctness and efficiency is a grand challenge. To facilitate this task, concurrent data structures have been introduced into the standard library of popular languages like Java and C\#. Unfortunately, while the operations exposed by concurrent data structures are atomic (or linearizable), compositions of these operations are not necessarily atomic. Recent studies have found many erroneous implementations of composed concurrent operations. We address the problem of fixing nonlinearizable composed operations such that they behave atomically. We introduce Flint, an automated fixing algorithm for composed Map operations. Flint accepts as input a composed operation suffering from atomicity violations. Its output, if fixing succeeds, is a composed operation that behaves equivalently to the original operation in sequential runs and is guaranteed to be atomic. To our knowledge, Flint is the first general algorithm for fixing incorrect concurrent compositions. We have evaluated Flint on 48 incorrect compositions from 27 popular applications, including Tomcat and MyFaces. The results are highly encouraging: Flint is able to correct 96\% of the methods, and the fixed version is often the same as the fix by an expert programmer and as efficient as the original code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Song:2014:SDR, author = "Linhai Song and Shan Lu", title = "Statistical debugging for real-world performance problems", journal = j-SIGPLAN, volume = "49", number = "10", pages = "561--578", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660234", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Design and implementation defects that lead to inefficient computation widely exist in software. These defects are difficult to avoid and discover. They lead to severe performance degradation and energy waste during production runs, and are becoming increasingly critical with the meager increase of single-core hardware performance and the increasing concerns about energy constraints. Effective tools that diagnose performance problems and point out the inefficiency root cause are sorely needed. The state of the art of performance diagnosis is preliminary. 
Profiling can identify the functions that consume the most computation resources, but can neither identify the ones that waste the most resources nor explain why. Performance-bug detectors can identify specific type of inefficient computation, but are not suited for diagnosing general performance problems. Effective failure diagnosis techniques, such as statistical debugging, have been proposed for functional bugs. However, whether they work for performance problems is still an open question. In this paper, we first conduct an empirical study to understand how performance problems are observed and reported by real-world users. Our study shows that statistical debugging is a natural fit for diagnosing performance problems, which are often observed through comparison-based approaches and reported together with both good and bad inputs. We then thoroughly investigate different design points in statistical debugging, including three different predicates and two different types of statistical models, to understand which design point works the best for performance diagnosis. Finally, we study how some unique nature of performance bugs allows sampling techniques to lower the overhead of run-time performance diagnosis without extending the diagnosis latency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Parr:2014:ALP, author = "Terence Parr and Sam Harwell and Kathleen Fisher", title = "Adaptive {LL(*)} parsing: the power of dynamic analysis", journal = j-SIGPLAN, volume = "49", number = "10", pages = "579--598", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660202", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite the advances made by modern parsing strategies such as PEG, LL(*), GLR, and GLL, parsing is not a solved problem. Existing approaches suffer from a number of weaknesses, including difficulties supporting side-effecting embedded actions, slow and/or unpredictable performance, and counter-intuitive matching strategies. This paper introduces the ALL(*) parsing strategy that combines the simplicity, efficiency, and predictability of conventional top-down LL(k) parsers with the power of a GLR-like mechanism to make parsing decisions. The critical innovation is to move grammar analysis to parse-time, which lets ALL(*) handle any non-left-recursive context-free grammar. ALL(*) is O(n$^4$) in theory but consistently performs linearly on grammars used in practice, outperforming general strategies such as GLL and GLR by orders of magnitude. ANTLR 4 generates ALL(*) parsers and supports direct left-recursion through grammar rewriting.
Widespread ANTLR 4 use (5000 downloads/month in 2013) provides evidence that ALL(*) is effective for a wide variety of applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Gligoric:2014:AMB, author = "Milos Gligoric and Wolfram Schulte and Chandra Prasad and Danny van Velzen and Iman Narasamdya and Benjamin Livshits", title = "Automated migration of build scripts using dynamic analysis and search-based refactoring", journal = j-SIGPLAN, volume = "49", number = "10", pages = "599--616", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660239", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The efficiency of a build system is an important factor for developer productivity. As a result, developer teams have been increasingly adopting new build systems that allow higher build parallelization. However, migrating the existing legacy build scripts to new build systems is a tedious and error-prone process. Unfortunately, there is insufficient support for automated migration of build scripts, making the migration more problematic. We propose the first dynamic approach for automated migration of build scripts to new build systems. Our approach works in two phases. First, from a set of execution traces, we synthesize build scripts that accurately capture the intent of the original build. The synthesized build scripts are typically long and hard to maintain. Second, we apply refactorings that raise the abstraction level of the synthesized scripts (e.g., introduce functions for similar fragments). As different refactoring sequences may lead to different build scripts, we use a search-based approach that explores various sequences to identify the best (e.g., shortest) build script. We optimize search-based refactoring with partial-order reduction to faster explore refactoring sequences. We implemented the proposed two phase migration approach in a tool called METAMORPHOSIS that has been recently used at Microsoft.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Kumar:2014:MCM, author = "Vineet Kumar and Laurie Hendren", title = "{MIX10}: compiling {MATLAB} to {X10} for high performance", journal = j-SIGPLAN, volume = "49", number = "10", pages = "617--636", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660218", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/matlab.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "MATLAB is a popular dynamic array-based language commonly used by students, scientists and engineers who appreciate the interactive development style, the rich set of array operators, the extensive builtin library, and the fact that they do not have to declare static types. Even though these users like to program in MATLAB, their computations are often very compute-intensive and are better suited for emerging high performance computing systems.
This paper reports on MIX10, a source-to-source compiler that automatically translates MATLAB programs to X10, a language designed for ``Performance and Productivity at Scale''; thus, helping scientific programmers make better use of high performance computing systems. There is a large semantic gap between the array-based dynamically-typed nature of MATLAB and the object-oriented, statically-typed, and high-level array abstractions of X10. This paper addresses the major challenges that must be overcome to produce sequential X10 code that is competitive with state-of-the-art static compilers for MATLAB which target more conventional imperative languages such as C and Fortran. Given that efficient basis, the paper then provides a translation for the MATLAB parfor construct that leverages the powerful concurrency constructs in X10. The MIX10 compiler has been implemented using the McLab compiler tools, is open source, and is available both for compiler researchers and end-user MATLAB programmers. We have used the implementation to perform many empirical measurements on a set of 17 MATLAB benchmarks. We show that our best MIX10-generated code is significantly faster than the de facto Mathworks' MATLAB system, and that our results are competitive with state-of-the-art static compilers that target C and Fortran. We also show the importance of finding the correct approach to representing the arrays in the generated X10 code, and the necessity of an `IntegerOkay' analysis that determines which double variables can be safely represented as integers. Finally, we show that our X10-based handling of the MATLAB parfor greatly outperforms the de facto MATLAB implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Jonnalagedda:2014:SPC, author = "Manohar Jonnalagedda and Thierry Coppey and Sandro Stucki and Tiark Rompf and Martin Odersky", title = "Staged parser combinators for efficient data processing", journal = j-SIGPLAN, volume = "49", number = "10", pages = "637--653", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660241", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parsers are ubiquitous in computing, and many applications depend on their performance for decoding data efficiently. Parser combinators are an intuitive tool for writing parsers: tight integration with the host language enables grammar specifications to be interleaved with processing of parse results. Unfortunately, parser combinators are typically slow due to the high overhead of the host language abstraction mechanisms that enable composition. We present a technique for eliminating such overhead. We use staging, a form of runtime code generation, to dissociate input parsing from parser composition, and eliminate intermediate data structures and computations associated with parser composition at staging time. A key challenge is to maintain support for input dependent grammars, which have no clear stage distinction. Our approach applies to top-down recursive-descent parsers as well as bottom-up non-deterministic parsers with key applications in dynamic programming on sequences, where we auto-generate code for parallel hardware.
We achieve performance comparable to specialized, hand-written parsers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Rosner:2014:BET, author = "Nicol{\'a}s Rosner and Valeria Bengolea and Pablo Ponzio and Shadi Abdul Khalek and Nazareno Aguirre and Marcelo F. Frias and Sarfraz Khurshid", title = "Bounded exhaustive test input generation from hybrid invariants", journal = j-SIGPLAN, volume = "49", number = "10", pages = "655--674", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660232", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a novel technique for producing bounded exhaustive test suites from hybrid invariants, i.e., invariants that are expressed imperatively, declaratively, or as a combination of declarative and imperative predicates. Hybrid specifications are processed using known mechanisms for the imperative and declarative parts, but combined in a way that enables us to exploit information from the declarative side, such as tight bounds computed from the declarative specification, to improve the search both on the imperative and declarative sides. Moreover, our technique automatically evaluates different possible ways of processing the imperative side, and the alternative settings (imperative or declarative) for parts of the invariant available both declaratively and imperatively, to decide the most convenient invariant configuration with respect to efficiency in test generation. This is achieved by transcoping, i.e., by assessing the efficiency of the different alternatives on small scopes (where generation times are negligible), and then extrapolating the results to larger scopes. We also show experiments involving collection classes that support the effectiveness of our technique, by demonstrating that (i) bounded exhaustive suites can be computed from hybrid invariants significantly more efficiently than doing so using state-of-the-art purely imperative and purely declarative approaches, and (ii) our technique is able to automatically determine efficient hybrid invariants, in the sense that they lead to an efficient computation of bounded exhaustive suites, using transcoping.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Wang:2014:CVM, author = "Peng Wang and Santiago Cuellar and Adam Chlipala", title = "Compiler verification meets cross-language linking via data abstraction", journal = j-SIGPLAN, volume = "49", number = "10", pages = "675--690", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660201", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many real programs are written in multiple different programming languages, and supporting this pattern creates challenges for formal compiler verification. 
We describe our Coq verification of a compiler for a high-level language, such that the compiler correctness theorem allows us to derive partial-correctness Hoare-logic theorems for programs built by linking the assembly code output by our compiler and assembly code produced by other means. Our compiler supports such tricky features as storable cross-language function pointers, without giving up the usual benefits of being able to verify different compiler phases (including, in our case, two classic optimizations) independently. The key technical innovation is a mixed operational and axiomatic semantics for the source language, with a built-in notion of abstract data types, such that compiled code interfaces with other languages only through axiomatically specified methods that mutate encapsulated private data, represented in whatever formats are most natural for those languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Turon:2014:GNW, author = "Aaron Turon and Viktor Vafeiadis and Derek Dreyer", title = "{GPS}: navigating weak memory with ghosts, protocols, and separation", journal = j-SIGPLAN, volume = "49", number = "10", pages = "691--707", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660243", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Weak memory models formalize the inconsistent behaviors that one can expect to observe in multithreaded programs running on modern hardware. In so doing, however, they complicate the already-difficult task of reasoning about correctness of concurrent code. Worse, they render impotent the sophisticated formal methods that have been developed to tame concurrency, which almost universally assume a strong (i.e., sequentially consistent) memory model. This paper introduces GPS, the first program logic to provide a full-fledged suite of modern verification techniques --- including ghost state, protocols, and separation logic --- for high-level, structured reasoning about weak memory. We demonstrate the effectiveness of GPS by applying it to challenging examples drawn from the Linux kernel as well as lock-free data structures. We also define the semantics of GPS and prove in Coq that it is sound with respect to the axiomatic C11 weak memory model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Desai:2014:NPA, author = "Ankush Desai and Pranav Garg and P. Madhusudan", title = "Natural proofs for asynchronous programs using almost-synchronous reductions", journal = j-SIGPLAN, volume = "49", number = "10", pages = "709--725", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660211", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We consider the problem of provably verifying that an asynchronous message-passing system satisfies its local assertions.
We present a novel reduction scheme for asynchronous event-driven programs that finds almost-synchronous invariants --- invariants consisting of global states where message buffers are close to empty. The reduction finds almost-synchronous invariants and simultaneously argues that they cover all local states. We show that asynchronous programs often have almost-synchronous invariants and that we can exploit this to build natural proofs that they are correct. We implement our reduction strategy, which is sound and complete, and show that it is more effective in proving programs correct as well as more efficient in finding bugs in several programs, compared to current search strategies which almost always diverge. The high point of our experiments is that our technique can prove the Windows Phone USB Driver written in P [9] correct for the responsiveness property, which was hitherto not provable using state-of-the-art model-checkers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Zhang:2014:AIO, author = "Wei Zhang and Per Larsen and Stefan Brunthaler and Michael Franz", title = "Accelerating iterators in optimizing {AST} interpreters", journal = j-SIGPLAN, volume = "49", number = "10", pages = "727--743", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660223", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Generators offer an elegant way to express iterators. However, their performance has always been their Achilles heel and has prevented widespread adoption. We present techniques to efficiently implement and optimize generators. We have implemented our optimizations in ZipPy, a modern, light-weight AST interpreter based Python 3 implementation targeting the Java virtual machine. Our implementation builds on a framework that optimizes AST interpreters using just-in-time compilation. In such a system, it is crucial that AST optimizations do not prevent subsequent optimizations. Our system was carefully designed to avoid this problem. We report an average speedup of 3.58x for generator-bound programs.
As a result, using generators no longer has downsides and programmers are free to enjoy their upsides.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Zhao:2014:CSP, author = "Zhijia Zhao and Bo Wu and Mingzhou Zhou and Yufei Ding and Jianhua Sun and Xipeng Shen and Youfeng Wu", title = "Call sequence prediction through probabilistic calling automata", journal = j-SIGPLAN, volume = "49", number = "10", pages = "745--762", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660221", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Predicting a sequence of upcoming function calls is important for optimizing programs written in modern managed languages (e.g., Java, Javascript, C\#.) Existing function call predictions are mainly built on statistical patterns, suitable for predicting a single call but not a sequence of calls. This paper presents a new way to enable call sequence prediction, which exploits program structures through Probabilistic Calling Automata (PCA), a new program representation that captures both the inherent ensuing relations among function calls, and the probabilistic nature of execution paths. It shows that PCA-based prediction outperforms existing predictions, yielding substantial speedup when being applied to guide Just-In-Time compilation. By enabling accurate, efficient call sequence prediction for the first time, PCA-based predictors open up many new opportunities for dynamic program optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Zhou:2014:SEM, author = "Mingzhou Zhou and Xipeng Shen and Yaoqing Gao and Graham Yiu", title = "Space-efficient multi-versioning for input-adaptive feedback-driven program optimizations", journal = j-SIGPLAN, volume = "49", number = "10", pages = "763--776", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660229", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Function versioning is an approach to addressing input-sensitivity of program optimizations. A major side effect of it is notable code size increase, which has been hindering its broad applications to large code bases and space-stringent environments. In this paper, we initiate a systematic exploration into the problem, providing answers to some fundamental questions: Given a space constraint, to which function we should apply versioning? How many versions of a function should we include in the final executable? Is the optimal selection feasible to do in polynomial time? This study proves selecting the best set of versions under a space constraint is NP-complete and proposes a heuristic algorithm named CHoGS which yields near optimal results in quadratic time. We implement the algorithm and conduct experiments through the IBM XL compilers. 
We observe significant performance enhancement with only slight code size increase; the results from CHoGS show factors of higher space efficiency than those from traditional hotness-based methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Adams:2014:HVM, author = "Keith Adams and Jason Evans and Bertrand Maher and Guilherme Ottoni and Andrew Paroski and Brett Simmers and Edwin Smith and Owen Yamauchi", title = "The {HipHop Virtual Machine}", journal = j-SIGPLAN, volume = "49", number = "10", pages = "777--790", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660199", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "The HipHop Virtual Machine (HHVM) is a JIT compiler and runtime for PHP. While PHP values are dynamically typed, real programs often have latent types that are useful for optimization once discovered. Some types can be proven through static analysis, but limitations in the ahead-of-time approach leave some types to be discovered at run time. And even though many values have latent types, PHP programs can also contain polymorphic variables and expressions, which must be handled without catastrophic slowdown. HHVM discovers latent types by structuring its JIT around the concept of a tracelet. A tracelet is approximately a basic block specialized for a particular set of run-time types for its input values. Tracelets allow HHVM to exactly and efficiently learn the types observed by the program, while using a simple compiler. This paper shows that this approach enables HHVM to achieve high levels of performance, without sacrificing compatibility or interactivity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Nazare:2014:VMA, author = "Henrique Nazar{\'e} and Izabela Maffra and Willer Santos and Leonardo Barbosa and Laure Gonnord and Fernando Magno Quint{\~a}o Pereira", title = "Validation of memory accesses through symbolic analyses", journal = j-SIGPLAN, volume = "49", number = "10", pages = "791--809", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660205", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The C programming language does not prevent out-of-bounds memory accesses. There exist several techniques to secure C programs; however, these methods tend to slow down these programs substantially, because they populate the binary code with runtime checks. To deal with this problem, we have designed and tested two static analyses --- symbolic region and range analysis --- which we combine to remove the majority of these guards. In addition to the analyses themselves, we bring two other contributions. First, we describe live range splitting strategies that improve the efficiency and the precision of our analyses. Secondly, we show how to deal with integer overflows, a phenomenon that can compromise the correctness of static algorithms that validate memory accesses. 
We validate our claims by incorporating our findings into AddressSanitizer. We generate SPEC CINT 2006 code that is 17\% faster and 9\% more energy efficient than the code produced originally by this tool. Furthermore, our approach is 50\% more effective than Pentagons, a state-of-the-art analysis to sanitize memory accesses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Partush:2014:ASD, author = "Nimrod Partush and Eran Yahav", title = "Abstract semantic differencing via speculative correlation", journal = j-SIGPLAN, volume = "49", number = "10", pages = "811--828", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660245", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We address the problem of computing semantic differences between a program and a patched version of the program. Our goal is to obtain a precise characterization of the difference between program versions, or establish their equivalence. We focus on infinite-state numerical programs, and use abstract interpretation to compute an over-approximation of program differences. Computing differences and establishing equivalence under abstraction requires abstracting relationships between variables in the two programs. Towards that end, we use a correlating abstract domain to compute a sound approximation of these relationships which captures semantic difference. This approximation can be computed over any interleaving of the two programs. However, the choice of interleaving can significantly affect precision. We present a speculative search algorithm that aims to find an interleaving of the two programs with minimal abstract semantic difference. This method is unique as it allows the analysis to dynamically alternate between several interleavings. We have implemented our approach and applied it to real-world examples including patches from Git, GNU Coreutils, as well as a few handpicked patches from the Linux kernel and the Mozilla Firefox web browser. Our evaluation shows that we compute precise approximations of semantic differences, and report few false differences.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Zhang:2014:ESA, author = "Qirun Zhang and Xiao Xiao and Charles Zhang and Hao Yuan and Zhendong Su", title = "Efficient subcubic alias analysis for {C}", journal = j-SIGPLAN, volume = "49", number = "10", pages = "829--845", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660213", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Inclusion-based alias analysis for C can be formulated as a context-free language (CFL) reachability problem. It is well known that the traditional cubic CFL-reachability algorithm does not scale well in practice. We present a highly scalable and efficient CFL-reachability-based alias analysis for C. 
The key novelty of our algorithm is to propagate reachability information along only original graph edges and bypass a large portion of summary edges, while the traditional CFL-reachability algorithm propagates along all summary edges. We also utilize the Four Russians' Trick --- a key enabling technique in the subcubic CFL-reachability algorithm --- in our alias analysis. We have implemented our subcubic alias analysis and conducted extensive experiments on widely-used C programs from the pointer analysis literature. The results demonstrate that our alias analysis scales extremely well in practice. In particular, it can analyze the recent Linux kernel (which consists of 10M SLOC) in about 30 seconds.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Brutschy:2014:SAI, author = "Lucas Brutschy and Pietro Ferrara and Peter M{\"u}ller", title = "Static analysis for independent app developers", journal = j-SIGPLAN, volume = "49", number = "10", pages = "847--860", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660219", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mobile app markets have lowered the barrier to market entry for software producers. As a consequence, an increasing number of independent app developers offer their products, and recent platforms such as the MIT App Inventor and Microsoft's TouchDevelop enable even lay programmers to develop apps and distribute them in app markets. A major challenge in this distribution model is to ensure the quality of apps. Besides the usual sources of software errors, mobile apps are susceptible to errors caused by the non-determinism of an event-based execution model, a volatile environment, diverse hardware, and others. Many of these errors are difficult to detect during testing, especially for independent app developers, who are not supported by test teams and elaborate test infrastructures. To address this problem, we propose a static program analysis that captures the specifics of mobile apps and is efficient enough to provide feedback during the development process. Experiments involving 51,456 published TouchDevelop scripts show that our analysis analyzes 98\% of the scripts in under a minute, and five seconds on average. 
Manual inspection of the analysis results for a selection of all scripts shows that most of the alarms are real errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Vora:2014:AEA, author = "Keval Vora and Sai Charan Koduru and Rajiv Gupta", title = "{ASPIRE}: exploiting asynchronous parallelism in iterative algorithms using a relaxed consistency based {DSM}", journal = j-SIGPLAN, volume = "49", number = "10", pages = "861--878", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660227", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many vertex-centric graph algorithms can be expressed using asynchronous parallelism by relaxing certain read-after-write data dependences and allowing threads to compute vertex values using stale (i.e., not the most recent) values of their neighboring vertices. We observe that on distributed shared memory systems, by converting synchronous algorithms into their asynchronous counterparts, algorithms can be made tolerant to high inter-node communication latency. However, high inter-node communication latency can lead to excessive use of stale values causing an increase in the number of iterations required by the algorithms to converge. Although by using bounded staleness we can restrict the slowdown in the rate of convergence, this also restricts the ability to tolerate communication latency. In this paper we design a relaxed memory consistency model and consistency protocol that simultaneously tolerate communication latency and minimize the use of stale values. This is achieved via a coordinated use of best effort refresh policy and bounded staleness. We demonstrate that for a range of asynchronous graph algorithms and PDE solvers, on an average, our approach outperforms algorithms based upon: prior relaxed memory models that allow stale values by at least 2.27x; and Bulk Synchronous Parallel (BSP) model by 4.2x. We also show that our approach frequently outperforms GraphLab, a popular distributed graph processing framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Holt:2014:AAL, author = "Brandon Holt and Preston Briggs and Luis Ceze and Mark Oskin", title = "{Alembic}: automatic locality extraction via migration", journal = j-SIGPLAN, volume = "49", number = "10", pages = "879--894", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660194", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Partitioned Global Address Space (PGAS) environments simplify writing parallel code for clusters because they make data movement implicit --- dereferencing global pointers automatically moves data around. However, it does not free the programmer from needing to reason about locality --- poor placement of data can lead to excessive and even unnecessary communication. For this reason, modern PGAS languages such as X10, Chapel, and UPC allow programmers to express data-layout constraints and explicitly move computation. 
This places an extra burden on the programmer, and is less effective for applications with limited or data-dependent locality (e.g., graph analytics). This paper proposes Alembic, a new static analysis that frees programmers from having to manually move computation to exploit locality in PGAS programs. It works by determining regions of code that access the same cluster node, then transforming the code to migrate parts of the execution to increase the proportion of accesses to local data. We implement the analysis and transformation for C++ in LLVM and show that in irregular application kernels, Alembic can achieve 82\% of the performance of hand-tuned communication (for comparison, na{\"\i}ve compiler-generated communication achieves only 13\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Xiao:2014:CPL, author = "Tian Xiao and Zhenyu Guo and Hucheng Zhou and Jiaxing Zhang and Xu Zhao and Chencheng Ye and Xi Wang and Wei Lin and Wenguang Chen and Lidong Zhou", title = "{Cybertron}: pushing the limit on {I/O} reduction in data-parallel programs", journal = j-SIGPLAN, volume = "49", number = "10", pages = "895--908", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660204", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "I/O reduction has been a major focus in optimizing data-parallel programs for big-data processing. While the current state-of-the-art techniques use static program analysis to reduce I/O, Cybertron proposes a new direction that incorporates runtime mechanisms to push the limit further on I/O reduction. In particular, Cybertron tracks how data is used in the computation accurately at runtime to filter unused data at finer granularity dynamically, beyond what current static-analysis based mechanisms are capable of, and to facilitate a new mechanism called constraint based encoding for more efficient encoding. Cybertron has been implemented and applied to production data-parallel programs; our extensive evaluations on real programs and real data have shown its effectiveness on I/O reduction over the existing mechanisms at reasonable CPU cost, and its improvement on end-to-end performance in various network environments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Radoi:2014:TIC, author = "Cosmin Radoi and Stephen J. Fink and Rodric Rabbah and Manu Sridharan", title = "Translating imperative code to {MapReduce}", journal = j-SIGPLAN, volume = "49", number = "10", pages = "909--927", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660228", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an approach for automatic translation of sequential, imperative code into a parallel MapReduce framework. 
Automating such a translation is challenging: imperative updates must be translated into a functional MapReduce form in a manner that both preserves semantics and enables parallelism. Our approach works by first translating the input code into a functional representation, with loops succinctly represented by fold operations. Then, guided by rewrite rules, our system searches a space of equivalent programs for an effective MapReduce implementation. The rules include a novel technique for handling irregular loop-carried dependencies using group-by operations to enable greater parallelism. We have implemented our technique in a tool called Mold. It translates sequential Java code into code targeting the Apache Spark runtime. We evaluated Mold on several real-world kernels and found that in most cases Mold generated the desired MapReduce program, even for codes with complex indirect updates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '14 conference proceedings.", } @Article{Guyer:2014:UJT, author = "Samuel Z. Guyer", title = "Use of the {JVM} at {Twitter}: a bird's eye view", journal = j-SIGPLAN, volume = "49", number = "11", pages = "1--1", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2619208", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Specialties: 15+ years of virtual machine implementation experience with special focus on memory management / garbage collection. Close to 20 years of C/C++ experience. 15+ years of Java experience. Expert in concurrent/parallel programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Terei:2014:MHP, author = "David Terei and Alex Aiken and Jan Vitek", title = "{$ M^3 $}: high-performance memory management from off-the-shelf components", journal = j-SIGPLAN, volume = "49", number = "11", pages = "3--13", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602995", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Real-world garbage collectors in managed languages are complex. We investigate whether this complexity is really necessary and show that by having a different (but wider) interface between the collector and the developer, we can achieve high performance with off-the-shelf components for real applications. We propose to assemble a memory manager out of multiple, simple collection strategies and to expose the choice of where to use those strategies in the program to the developer. We describe and evaluate an instantiation of our design for C. Our prototype allows developers to choose on a per-type basis whether data should be reference counted or reclaimed by a tracing collector. 
While neither strategy is optimised, our empirical data shows that we can achieve performance that is competitive with hand-tuned C code for real-world applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Clifford:2014:AFB, author = "Daniel Clifford and Hannes Payer and Michael Starzinger and Ben L. Titzer", title = "Allocation folding based on dominance", journal = j-SIGPLAN, volume = "49", number = "11", pages = "15--24", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602994", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Memory management system performance is of increasing importance in today's managed languages. Two lingering sources of overhead are the direct costs of memory allocations and write barriers. This paper introduces allocation folding, an optimization technique where the virtual machine automatically folds multiple memory allocation operations in optimized code together into a single, larger allocation group. An allocation group comprises multiple objects and requires just a single bounds check in a bump-pointer style allocation, rather than a check for each individual object. More importantly, all objects allocated in a single allocation group are guaranteed to be contiguous after allocation and thus exist in the same generation, which makes it possible to statically remove write barriers for reference stores involving objects in the same allocation group. Unlike object inlining, object fusing, and object colocation, allocation folding requires no special connectivity or ownership relation between the objects in an allocation group. We present our analysis algorithm to determine when it is safe to fold allocations together and discuss our implementation in V8, an open-source, production JavaScript virtual machine. We present performance results for the Octane and Kraken benchmark suites and show that allocation folding is a strong performance improvement, even in the presence of some heap fragmentation. Additionally, we use four hand-selected benchmarks JPEGEncoder, NBody, Soft3D, and Textwriter where allocation folding has a large impact.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Ratnakar:2014:PPC, author = "Bollu Ratnakar and Rupesh Nasre", title = "Push-pull constraint graph for efficient points-to analysis", journal = j-SIGPLAN, volume = "49", number = "11", pages = "25--33", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602989", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present techniques for efficient computation of points-to information for C programs. Pointer analysis is an important phase in the compilation process.
The computed points-to information and the alias information is useful for client analyses from varied domains such as bug finding, data-flow analysis, identifying security vulnerabilities, and parallelization, to name a few. Former research on pointer analysis has indicated that the main bottleneck towards scalability is manifested by the presence of complex constraints (load p = *q and store *p = q constraints) in the program. Complex constraints add edges to the constraint graph in an unpredictable manner and are responsible for initiating propagation of large amounts of points-to information across edges. We identify that the root cause to this issue is in the homogeneous structure in the constraint graph, due to which existing analyses treat loads and stores in a uniform manner. To address these issues, we present two techniques. First, we represent a constraint graph in a non-homogeneous manner, treat loads and stores in different ways, and employ a push-pull model for non-uniform propagation. Second, we propose lazy propagation which propagates information in the constraint graph only when necessary. We illustrate the effectiveness of our techniques using six large open-source programs and show that they improve the analysis time over a state-of-the-art BDD-based analysis by 33\% and over Deep Propagation by 21\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Joisha:2014:STF, author = "Pramod G. Joisha", title = "Sticky tries: fast insertions, fast lookups, no deletions for large key universes", journal = j-SIGPLAN, volume = "49", number = "11", pages = "35--46", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602998", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the sticky trie, a new variant of the standard trie data structure that achieves high-performing atomic insertions and lookups for large key universes by precluding deletions. It has applications in several areas, including address tracking, logging, and garbage collection. By leveraging features of a modern operating system, we show how a runtime can exploit the absence of deletions to realize an efficient sticky-trie implementation. We report on an evaluation of two representative uses --- compelling Bloom-filter alternative and fast substitute for a garbage collector's sequential store buffer (SSB). We demonstrate that a sticky trie, when compared with what is perhaps among the simplest Bloom filters, can be over 43\% faster, scale substantially better with increasing threads, and yet be free of false positives. By introducing the concept of an ideal SSB, we also demonstrate that a sticky trie could be competitive in performance with a class of SSBs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Brandt:2014:CPG, author = "Steven R. 
Brandt and Hari Krishnan and Gokarna Sharma and Costas Busch", title = "Concurrent, parallel garbage collection in linear time", journal = j-SIGPLAN, volume = "49", number = "11", pages = "47--58", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602990", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a new concurrent garbage collection algorithm based on two types of reference, strong and weak, to link the graph of objects. Strong references connect the roots to all the nodes in the graph but do not contain cycles. Weak references may, however, contain cycles. Advantages of this system include: (1) reduced processing, non-trivial garbage collection work is only required when the last strong reference is lost; (2) fewer memory traces to delete objects, a garbage cycle only needs to be traversed twice to be deleted; (3) fewer memory traces to retain objects, since the collector can often prove objects are reachable without fully tracing support cycles to which the objects belong; (4) concurrency, it can run in parallel with a live system without ``stopping the world''; (5) parallel, because collection operations in different parts of the memory can proceed at the same time. Previous variants of this technique required exponential cleanup time, but our algorithm is linear in total time, i.e. any changes in the graph take only O(N) time steps, where N is the number of edges in the affected subgraph (e.g. the subgraph whose strong support is affected by the operations).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Ugawa:2014:ROP, author = "Tomoharu Ugawa and Richard E. Jones and Carl G. Ritson", title = "Reference object processing in on-the-fly garbage collection", journal = j-SIGPLAN, volume = "49", number = "11", pages = "59--69", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602991", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most proposals for on-the-fly garbage collection ignore the question of Java's weak and other reference types. However, we show that reference types are heavily used in DaCapo benchmarks. Of the few collectors that do address this issue, most block mutators, either globally or individually, while processing reference types. We introduce a new framework for processing reference types on-the-fly in Jikes RVM. Our framework supports both insertion and deletion write barriers. We have model checked our algorithm and incorporated it in our new implementation of the Sapphire on-the-fly collector. 
Using a deletion barrier, we process references while mutators are running in less than three times the time that previous approaches take while mutators are halted; our overall execution times are no worse, and often better.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Li:2014:MHD, author = "Pengcheng Li and Chen Ding and Hao Luo", title = "Modeling heap data growth using average liveness", journal = j-SIGPLAN, volume = "49", number = "11", pages = "71--82", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602997", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Most of today's programs make use of a sizable heap to store dynamic data. To characterize the heap dynamics, this paper presents a set of metrics to measure the average amount of data live and dead in a period of execution. They are collectively called average liveness. The paper defines these metrics of average liveness, gives linear-time algorithms for measurement, and discusses their use in finding the best heap size. The algorithms are implemented in a Java tracing system called Elephant Tracks and evaluated using the DaCapo benchmarks running on the Oracle HotSpot and IBM J9 Java virtual machines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Zakkak:2014:JJM, author = "Foivos S. Zakkak and Polyvios Pratikakis", title = "{JDMM}: a {Java} memory model for non-cache-coherent memory architectures", journal = j-SIGPLAN, volume = "49", number = "11", pages = "83--92", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602999", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "As the number of cores continuously grows, processor designers are considering non coherent memories as more scalable and energy efficient alternatives to the current coherent ones. The Java Memory Model (JMM) requires that all cores can access the Java heap. It guarantees sequential consistency for data-race-free programs and no out-of-thin-air values for non data-race-free programs. To implement the Java Memory Model over non-cache-coherent and distributed architectures Java Virtual Machines (JVMs) are most likely to employ software caching. In this work, (i) we provide a formalization of the Java Memory Model for non-cache-coherent and distributed memory architectures, (ii) prove the adherence of our model with the Java Memory Model and (iii) evaluate, regarding its compliance to the Java Memory Model, a state-of-the-art Java Virtual Machine implementation on a non-cache-coherent architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Egielski:2014:MAM, author = "Ian J.
Egielski and Jesse Huang and Eddy Z. Zhang", title = "Massive atomics for massive parallelism on {GPUs}", journal = j-SIGPLAN, volume = "49", number = "11", pages = "93--103", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602993", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "One important type of parallelism exploited in many applications is reduction type parallelism. In these applications, the order of the read-modify-write updates to one shared data object can be arbitrary as long as there is an imposed order for the read-modify-write updates. The typical way to parallelize these types of applications is to first let every individual thread perform local computation and save the results in thread-private data objects, and then merge the results from all worker threads in the reduction stage. All applications that fit into the map reduce framework belong to this category. Additionally, the machine learning, data mining, numerical analysis and scientific simulation applications may also benefit from reduction type parallelism. However, the parallelization scheme via the usage of thread-private data objects may not be viable in massively parallel GPU applications. Because the number of concurrent threads is extremely large (at least tens of thousands), thread-private data object creation may lead to memory space explosion problems. In this paper, we propose a novel approach to deal with shared data object management for reduction type parallelism on GPUs. Our approach exploits fine-grained parallelism while at the same time maintaining good programmability. It is based on the usage of intrinsic hardware atomic instructions. Atomic operation may appear to be expensive since it causes thread serialization when multiple threads atomically update the same memory object at the same time. However, we discovered that, with appropriate atomic collision reduction techniques, the atomic implementation can outperform the non-atomics implementation, even for benchmarks known to have high performance non-atomics GPU implementations. In the meantime, the usage of atomics can greatly reduce coding complexity as neither thread-private object management nor explicit thread-communication (for the shared data objects protected by atomic operations) is necessary.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Ritson:2014:EGC, author = "Carl G. Ritson and Tomoharu Ugawa and Richard E. Jones", title = "Exploring garbage collection with {Haswell} hardware transactional memory", journal = j-SIGPLAN, volume = "49", number = "11", pages = "105--115", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602992", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Intel's latest processor microarchitecture, Haswell, adds support for a restricted form of transactional memory to the x86 programming model. We explore how this can be applied to three garbage collection scenarios in Jikes RVM: parallel copying, concurrent copying and bitmap marking.
We demonstrate gains in concurrent copying speed over traditional synchronisation mechanisms of 48-101\%. We also show how similar but portable performance gains can be achieved through software transactional memory techniques. We identify the architectural overhead of capturing sufficient work for transactional execution as a major stumbling block to the effective use of transactions in the other scenarios.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Bacon:2014:PRT, author = "David F. Bacon and Perry Cheng and Sunil Shukla", title = "Parallel real-time garbage collection of multiple heaps in reconfigurable hardware", journal = j-SIGPLAN, volume = "49", number = "11", pages = "117--127", month = nov, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775049.2602996", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite rapid increases in memory capacity, reconfigurable hardware is still programmed in a very low-level manner, generally without any dynamic allocation at all. This limits productivity especially as the larger chips encourage more and more complex designs to be attempted. Prior work has shown that it is possible to implement a real-time collector in hardware and achieve stall-free operation --- but at the price of severe restrictions on object layouts. We present the first hardware garbage collector capable of collecting multiple inter-connected heaps, thereby allowing a rich set of object types. We show that for a modest additional cost in logic and memory, we can support multiple heaps at a clock frequency competitive with monolithic, fixed-layout heaps. We evaluate the hardware design by synthesizing it for a Xilinx FPGA and using co-simulation to measure the run-time behavior over a set of four benchmarks. Even at high allocation and mutation rates the collector is able to sustain stall-free (100\% minimum mutator utilization) operation with up to 4 inter-connected heaps, while only requiring between 1.1 and 1.7 times the maximum live memory of the application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '14 conference proceedings.", } @Article{Wu:2014:EHS, author = "Nicolas Wu and Tom Schrijvers and Ralf Hinze", title = "Effect handlers in scope", journal = j-SIGPLAN, volume = "49", number = "12", pages = "1--12", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633358", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Algebraic effect handlers are a powerful means for describing effectful computations. They provide a lightweight and orthogonal technique to define and compose the syntax and semantics of different effects. The semantics is captured by handlers, which are functions that transform syntax trees. Unfortunately, the approach does not support syntax for scoping constructs, which arise in a number of scenarios. 
While handlers can be used to provide a limited form of scope, we demonstrate that this approach constrains the possible interactions of effects and rules out some desired semantics. This paper presents two different ways to capture scoped constructs in syntax, and shows how to achieve different semantics by reordering handlers. The first approach expresses scopes using the existing algebraic handlers framework, but has some limitations. The problem is fully solved in the second approach where we introduce higher-order syntax.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Orchard:2014:EES, author = "Dominic Orchard and Tomas Petricek", title = "Embedding effect systems in {Haskell}", journal = j-SIGPLAN, volume = "49", number = "12", pages = "13--24", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633368", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Monads are now an everyday tool in functional programming for abstracting and delimiting effects. The link between monads and effect systems is well-known, but in their typical use, monads provide a much more coarse-grained view of effects. Effect systems capture fine-grained information about the effects, but monads provide only a binary view: effectful or pure. Recent theoretical work has unified fine-grained effect systems with monads using a monad-like structure indexed by a monoid of effect annotations (called parametric effect monads). This aligns the power of monads with the power of effect systems. This paper leverages recent advances in Haskell's type system (as provided by GHC) to embed this approach in Haskell, providing user-programmable effect systems. We explore a number of practical examples that make Haskell even better and safer for effectful programming. Along the way, we relate the examples to other concepts, such as Haskell's implicit parameters and coeffects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Blanchette:2014:ERN, author = "Jasmin Christian Blanchette and Lars Hupel and Tobias Nipkow and Lars Noschinski and Dmitriy Traytel", title = "Experience report: the next 1100 {Haskell} programmers", journal = j-SIGPLAN, volume = "49", number = "12", pages = "25--30", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633359", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We report on our experience teaching a Haskell-based functional programming course to over 1100 students for two winter terms. The syllabus was organized around selected material from various sources. Throughout the terms, we emphasized correctness through QuickCheck tests and proofs by induction. The submission architecture was coupled with automatic testing, giving students the possibility to correct mistakes before the deadline. 
To motivate the students, we complemented the weekly assignments with an informal competition and gave away trophies in an award ceremony.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Muranushi:2014:ERT, author = "Takayuki Muranushi and Richard A. Eisenberg", title = "Experience report: type-checking polymorphic units for astrophysics research in {Haskell}", journal = j-SIGPLAN, volume = "49", number = "12", pages = "31--38", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633362", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many of the bugs in scientific programs have their roots in mistreatment of physical dimensions, via erroneous expressions in the quantity calculus. Now that the type system in the Glasgow Haskell Compiler is rich enough to support type-level integers and other promoted datatypes, we can type-check the quantity calculus in Haskell. In addition to basic dimension-aware arithmetic and unit conversions, our units library features an extensible system of dimensions and units, a notion of dimensions apart from that of units, and unit polymorphism designed to describe the laws of physics. We demonstrate the utility of units by writing an astrophysics research paper. This work is free of unit concerns because every quantity expression in the paper is rigorously type-checked.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Vazou:2014:LER, author = "Niki Vazou and Eric L. Seidel and Ranjit Jhala", title = "{LiquidHaskell}: experience with refinement types in the real world", journal = j-SIGPLAN, volume = "49", number = "12", pages = "39--51", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633366", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Haskell has many delightful features. Perhaps the one most beloved by its users is its type system that allows developers to specify and verify a variety of program properties at compile time. However, many properties, typically those that depend on relationships between program values are impossible, or at the very least, cumbersome to encode within the existing type system. Many such properties can be verified using a combination of Refinement Types and external SMT solvers. We describe the refinement type checker liquidHaskell, which we have used to specify and verify a variety of properties of over 10,000 lines of Haskell code from various popular libraries, including containers, hscolour, bytestring, text, vector-algorithms and xmonad. First, we present a high-level overview of liquidHaskell, through a tour of its features. Second, we present a qualitative discussion of the kinds of properties that can be checked --- ranging from generic application independent criteria like totality and termination, to application specific concerns like memory safety and data structure correctness invariants.
Finally, we present a quantitative evaluation of the approach, with a view towards measuring the efficiency and programmer effort required for verification, and discuss the limitations of the approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Pike:2014:SAE, author = "Lee Pike", title = "{SmartCheck}: automatic and efficient counterexample reduction and generalization", journal = j-SIGPLAN, volume = "49", number = "12", pages = "53--64", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633365", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "QuickCheck is a powerful library for automatic test-case generation. Because QuickCheck performs random testing, some of the counterexamples discovered are very large. QuickCheck provides an interface for the user to write shrink functions to attempt to reduce the size of counter examples. Hand-written implementations of shrink can be complex, inefficient, and consist of significant boilerplate code. Furthermore, shrinking is only one aspect in debugging: counterexample generalization is the process of extrapolating from individual counterexamples to a class of counterexamples, often requiring a flash of insight from the programmer. To improve counterexample reduction and generalization, we introduce SmartCheck. SmartCheck is a debugging tool that reduces algebraic data using generic search heuristics to efficiently find smaller counterexamples. In addition to shrinking, SmartCheck also automatically generalizes counterexamples to formulas representing classes of counterexamples. SmartCheck has been implemented for Haskell and is freely available.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Maier:2014:HDS, author = "Patrick Maier and Robert Stewart and Phil Trinder", title = "The {HdpH DSLs} for scalable reliable computation", journal = j-SIGPLAN, volume = "49", number = "12", pages = "65--76", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633363", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The statelessness of functional computations facilitates both parallelism and fault recovery. Faults and non-uniform communication topologies are key challenges for emergent large scale parallel architectures. We report on HdpH and HdpH-RS, a pair of Haskell DSLs designed to address these challenges for irregular task-parallel computations on large distributed-memory architectures. Both DSLs share an API combining explicit task placement with sophisticated work stealing. HdpH focuses on scalability by making placement and stealing topology aware whereas HdpH-RS delivers reliability by means of fault tolerant work stealing. We present operational semantics for both DSLs and investigate conditions for semantic equivalence of HdpH and HdpH-RS programs, that is, conditions under which topology awareness can be transparently traded for fault tolerance. 
We detail how the DSL implementations realise topology awareness and fault tolerance. We report an initial evaluation of scalability and fault tolerance on a 256-core cluster and on up to 32K cores of an HPC platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Okabe:2014:SDW, author = "Kiwamu Okabe and Takayuki Muranushi", title = "Systems demonstration: writing {NetBSD} sound drivers in {Haskell}", journal = j-SIGPLAN, volume = "49", number = "12", pages = "77--78", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633370", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most strongly typed, functional programming languages are not equipped with a reentrant garbage collector. Therefore such languages are not used for operating systems programming, where the virtues of types are most desired. We propose the use of Context-Local Heaps (CLHs) to achieve reentrancy, which also increases the speed of garbage collection. We have implemented CLHs in Ajhc, a Haskell compiler derived from jhc, rewritten some NetBSD sound drivers using Ajhc, and benchmarked them. The reentrant, faster garbage collection that CLHs provide opens the path to type-assisted operating systems programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Ekblad:2014:SCC, author = "Anton Ekblad and Koen Claessen", title = "A seamless, client-centric programming model for type safe web applications", journal = j-SIGPLAN, volume = "49", number = "12", pages = "79--89", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633367", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a new programming model for web applications which is (1) seamless; one program and one language is used to produce code for both client and server, (2) client-centric; the programmer takes the viewpoint of the client that runs code on the server rather than the other way around, (3) functional and type-safe, and (4) portable; everything is implemented as a Haskell library that implicitly takes care of all networking code. Our aim is to improve the painful and error-prone experience of today's standard development methods, in which clients and servers are coded in different languages and communicate with each other using ad-hoc protocols. We present the design of our library called Haste.App, an example web application that uses it, and discuss the implementation and the compiler technology on which it depends.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Levy:2014:DPM, author = "Amit A.
Levy and David Terei and Deian Stefan and David Mazi{\'e}res", title = "Demo proposal: making web applications --- {XSafe}", journal = j-SIGPLAN, volume = "49", number = "12", pages = "91--91", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633373", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Simple is a web framework for Haskell. Simple came out of our work on Hails, a platform for secure web applications. For Hails, we needed a flexible web framework that uses no unsafe language features and can be used to build apps outside the IO monad. Unlike many mainstream web frameworks, Simple does not enforce a particular structure or paradigm. Instead, it simply provides a set of composable building blocks to help developers structure and organize their web applications. We've used Simple to build both traditional web applications as well as applications with explicit, strong safety and security guarantees. In the demonstration, we'll focus on the former --- introducing the framework and motivating its utility for traditional web apps --- and show how we can leverage the LIO information flow control library to add mandatory security policies to apps.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Stefan:2014:BSS, author = "Deian Stefan and Amit Levy and Alejandro Russo and David Mazi{\'e}res", title = "Building secure systems with {LIO} (demo)", journal = j-SIGPLAN, volume = "49", number = "12", pages = "93--94", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633371", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "LIO is a decentralized information flow control (DIFC) system, implemented in Haskell. In this demo proposal, we give an overview of the LIO library and show how LIO can be used to build secure systems. In particular, we show how to specify high-level security policies in the context of web applications, and describe how LIO automatically enforces these policies even in the presence of untrusted code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Eisenberg:2014:PFT, author = "Richard A. Eisenberg and Jan Stolarek", title = "Promoting functions to type families in {Haskell}", journal = j-SIGPLAN, volume = "49", number = "12", pages = "95--106", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633361", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Haskell, as implemented in the Glasgow Haskell Compiler (GHC), is enriched with many extensions that support type-level programming, such as promoted datatypes, kind polymorphism, and type families. Yet, the expressiveness of the type-level language remains limited.
It is missing many features present at the term level, including case expressions, anonymous functions, partially-applied functions, and let expressions. In this paper, we present an algorithm --- with a proof of correctness --- to encode these term-level constructs at the type level. Our approach is automated and capable of promoting a wide array of functions to type families. We also highlight and discuss those term-level features that are not promotable. In so doing, we offer a critique on GHC's existing type system, showing what it is already capable of and where it may want improvement. We believe that delineating the mismatch between GHC's term level and its type level is a key step toward supporting dependently typed programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Morris:2014:SSH, author = "J. Garrett Morris", title = "A simple semantics for {Haskell} overloading", journal = j-SIGPLAN, volume = "49", number = "12", pages = "107--118", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633364", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As originally proposed, type classes provide overloading and ad-hoc definition, but can still be understood (and implemented) in terms of strictly parametric calculi. This is not true of subsequent extensions of type classes. Functional dependencies and equality constraints allow the satisfiability of predicates to refine typing; this means that the interpretations of equivalent qualified types may not be interconvertible. Overlapping instances and instance chains allow predicates to be satisfied without determining the implementations of their associated class methods, introducing truly non-parametric behavior. We propose a new approach to the semantics of type classes, interpreting polymorphic expressions by the behavior of each of their ground instances, but without requiring that those behaviors be parametrically determined. We argue that this approach both matches the intuitive meanings of qualified types and accurately models the behavior of programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Chakravarty:2014:FIC, author = "Manuel M. T. Chakravarty", title = "Foreign inline code: systems demonstration", journal = j-SIGPLAN, volume = "49", number = "12", pages = "119--120", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633372", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Adams:2014:ISP, author = "Michael D. Adams and {\"O}mer S. 
Agacan", title = "Indentation-sensitive parsing for {Parsec}", journal = j-SIGPLAN, volume = "49", number = "12", pages = "121--132", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633369", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Several popular languages including Haskell and Python use the indentation and layout of code as an essential part of their syntax. In the past, implementations of these languages used ad hoc techniques to implement layout. Recent work has shown that a simple extension to context-free grammars can replace these ad hoc techniques and provide both formal foundations and efficient parsing algorithms for indentation sensitivity. However, that previous work is limited to bottom-up, LR($k$) parsing, and many combinator-based parsing frameworks including Parsec use top-down algorithms that are outside its scope. This paper remedies this by showing how to add indentation sensitivity to parsing frameworks like Parsec. It explores both the formal semantics of and efficient algorithms for indentation sensitivity. It derives a Parsec-based library for indentation-sensitive parsing and presents benchmarks on a real-world language that show its efficiency and practicality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{vanderPloeg:2014:RRR, author = "Atze van der Ploeg and Oleg Kiselyov", title = "Reflection without remorse: revealing a hidden sequence to speed up monadic reflection", journal = j-SIGPLAN, volume = "49", number = "12", pages = "133--144", month = dec, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775050.2633360", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A series of list appends or monadic binds for many monads performs algorithmically worse when left-associated. Continuation-passing style (CPS) is well-known to cure this severe dependence of performance on the association pattern. The advantage of CPS dwindles or disappears if we have to examine or modify the intermediate result of a series of appends or binds, before continuing the series. Such examination is frequently needed, for example, to control search in non-determinism monads. We present an alternative approach that is just as general as CPS but more robust: it makes series of binds and other such operations efficient regardless of the association pattern-- and also provides efficient access to intermediate results. The key is to represent such a conceptual sequence as an efficient sequence data structure. Efficient sequence data structures from the literature are homogeneous and cannot be applied as they are in a type-safe way to series of monadic binds. We generalize them to type aligned sequences and show how to construct their (assuredly order-preserving) implementations. 
We demonstrate that our solution solves previously undocumented, severe performance problems in iteratees, LogicT transformers, free monads and extensible effects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '14 conference proceedings.", } @Article{Rajamani:2015:ART, author = "Sriram Rajamani", title = "Automating Repetitive Tasks for the Masses", journal = j-SIGPLAN, volume = "50", number = "1", pages = "1--2", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2682621", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The programming languages (PL) research community has traditionally catered to the needs of professional programmers in the continuously evolving technical industry. However, there is a new opportunity that knocks our doors. The recent IT revolution has resulted in the masses having access to personal computing devices. More than 99\% of these computer users are non-programmers and are today limited to being passive consumers of the software that is made available to them. Can we empower these users to more effectively leverage computers for their daily tasks? The formalisms, techniques, and tools developed in the PL and the formal methods research communities can play a pivotal role!", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Mellies:2015:FTR, author = "Paul-Andr{\'e} Melli{\`e}s and Noam Zeilberger", title = "Functors are Type Refinement Systems", journal = j-SIGPLAN, volume = "50", number = "1", pages = "3--16", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676970", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The standard reading of type theory through the lens of category theory is based on the idea of viewing a type system as a category of well-typed terms. We propose a basic revision of this reading: rather than interpreting type systems as categories, we describe them as functors from a category of typing derivations to a category of underlying terms. Then, turning this around, we explain how in fact any functor gives rise to a generalized type system, with an abstract notion of typing judgment, typing derivations and typing rules. This leads to a purely categorical reformulation of various natural classes of type systems as natural classes of functors. The main purpose of this paper is to describe the general framework (which can also be seen as providing a categorical analysis of refinement types ), and to present a few applications. As a larger case study, we revisit Reynolds' paper on ``The Meaning of Types'' (2000), showing how the paper's main results may be reconstructed along these lines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Krishnaswami:2015:ILD, author = "Neelakantan R. 
Krishnaswami and Pierre Pradic and Nick Benton", title = "Integrating Linear and Dependent Types", journal = j-SIGPLAN, volume = "50", number = "1", pages = "17--30", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676969", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we show how to integrate linear types with type dependency, by extending the linear/non-linear calculus of Benton to support type dependency. Next, we give an application of this calculus by giving a proof-theoretic account of imperative programming, which requires extending the calculus with computationally irrelevant quantification, proof irrelevance, and a monad of computations. We show the soundness of our theory by giving a realizability model in the style of Nuprl, which permits us to validate not only the beta-laws for each type, but also the eta-laws. These extensions permit us to decompose Hoare triples into a collection of simpler type-theoretic connectives, yielding a rich equational theory for dependently-typed higher-order imperative programs. Furthermore, both the type theory and its model are relatively simple, even when all of the extensions are considered.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Sojakova:2015:HIT, author = "Kristina Sojakova", title = "Higher Inductive Types as Homotopy-Initial Algebras", journal = j-SIGPLAN, volume = "50", number = "1", pages = "31--42", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676983", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Homotopy Type Theory is a new field of mathematics based on the recently-discovered correspondence between Martin-L{\"o}f's constructive type theory and abstract homotopy theory. We have a powerful interplay between these disciplines --- we can use geometric intuition to formulate new concepts in type theory and, conversely, use type-theoretic machinery to verify and often simplify existing mathematical proofs. Higher inductive types form a crucial part of this new system since they allow us to represent mathematical objects, such as spheres, tori, pushouts, and quotients, in the type theory. We investigate a class of higher inductive types called W-suspensions which generalize Martin-L{\"o}f's well-founded trees. We show that a propositional variant of W-suspensions, whose computational behavior is determined up to a higher path, is characterized by the universal property of being a homotopy-initial algebra. 
As a corollary we get that W-suspensions in the strict form are homotopy-initial.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Ngo:2015:RES, author = "Minh Ngo and Fabio Massacci and Dimiter Milushev and Frank Piessens", title = "Runtime Enforcement of Security Policies on Black Box Reactive Programs", journal = j-SIGPLAN, volume = "50", number = "1", pages = "43--54", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676978", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Security enforcement mechanisms like execution monitors are used to make sure that some untrusted program complies with a policy. Different enforcement mechanisms have different strengths and weaknesses and hence it is important to understand the qualities of various enforcement mechanisms. This paper studies runtime enforcement mechanisms for reactive programs. We study the impact of two important constraints that many practical enforcement mechanisms satisfy: (1) the enforcement mechanism must handle each input/output event in finite time and on occurrence of the event (as opposed to for instance Ligatti's edit automata that have the power to buffer events for an arbitrary amount of time), and (2) the enforcement mechanism treats the untrusted program as a black box: it can monitor and/or edit the input/output events that the program exhibits on execution and it can explore alternative executions of the program by running additional copies of the program and providing these different inputs. It can not inspect the source or machine code of the untrusted program. Such enforcement mechanisms are important in practice: they include for instance many execution monitors, virtual machine monitors, and secure multi-execution or shadow executions. We establish upper and lower bounds for the class of policies that are enforceable by such black box mechanisms, and we propose a generic enforcement mechanism that works for a wide range of policies. We also show how our generic enforcement mechanism can be instantiated to enforce specific classes of policies, at the same time showing that many existing enforcement mechanisms are optimized instances of our construction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Barthe:2015:HOA, author = "Gilles Barthe and Marco Gaboardi and Emilio Jes{\'u}s Gallego Arias and Justin Hsu and Aaron Roth and Pierre-Yves Strub", title = "Higher-Order Approximate Relational Refinement Types for Mechanism Design and Differential Privacy", journal = j-SIGPLAN, volume = "50", number = "1", pages = "55--68", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677000", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mechanism design is the study of algorithm design where the inputs to the algorithm are controlled by strategic agents, who must be incentivized to faithfully report them. 
Unlike typical programmatic properties, it is not sufficient for algorithms to merely satisfy the property; incentive properties are only useful if the strategic agents also believe this fact. Verification is an attractive way to convince agents that the incentive properties actually hold, but mechanism design poses several unique challenges: interesting properties can be sophisticated relational properties of probabilistic computations involving expected values, and mechanisms may rely on other probabilistic properties, like differential privacy, to achieve their goals. We introduce a relational refinement type system, called HOARe2, for verifying mechanism design and differential privacy. We show that HOARe2 is sound w.r.t. a denotational semantics, and correctly models (epsilon,delta)-differential privacy; moreover, we show that it subsumes DFuzz, an existing linear dependent type system for differential privacy. Finally, we develop an SMT-based implementation of HOARe2 and use it to verify challenging examples of mechanism design, including auctions and aggregative games, and new proposed examples from differential privacy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Ebadi:2015:DPN, author = "Hamid Ebadi and David Sands and Gerardo Schneider", title = "Differential Privacy: Now it's Getting Personal", journal = j-SIGPLAN, volume = "50", number = "1", pages = "69--81", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677005", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Differential privacy provides a way to get useful information about sensitive data without revealing much about any one individual. It enjoys many nice compositionality properties not shared by other approaches to privacy, including, in particular, robustness against side-knowledge. Designing differentially private mechanisms from scratch can be a challenging task. One way to make it easier to construct new differentially private mechanisms is to design a system which allows more complex mechanisms (programs) to be built from differentially private building blocks in a principled way, so that the resulting programs are guaranteed to be differentially private by construction. This paper is about a new accounting principle for building differentially private programs. It is based on a simple generalisation of classic differential privacy which we call Personalised Differential Privacy (PDP). In PDP each individual has its own personal privacy level. We describe ProPer, an interactive system for implementing PDP which maintains a privacy budget for each individual. When a primitive query is made on data derived from individuals, the provenance of the involved records determines how the privacy budget of an individual is affected: the number of records derived from Alice determines the multiplier for the privacy decrease in Alice's budget. This offers some advantages over previous systems, in particular its fine-grained character allows better utilisation of the privacy budget than mechanisms based purely on the concept of global sensitivity, and it applies naturally to the case of a live database where new individuals are added over time.
We provide a formal model of the ProPer approach, prove that it provides personalised differential privacy, and describe a prototype implementation based on McSherry's PINQ system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Tang:2015:SBC, author = "Hao Tang and Xiaoyin Wang and Lingming Zhang and Bing Xie and Lu Zhang and Hong Mei", title = "Summary-Based Context-Sensitive Data-Dependence Analysis in Presence of Callbacks", journal = j-SIGPLAN, volume = "50", number = "1", pages = "83--95", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676997", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Building a summary for library code is a common approach to speeding up the analysis of client code. In presence of callbacks, some reachability relationships between library nodes cannot be obtained during library-code summarization. Thus, the library code may have to be analyzed again during the analysis of the client code with the library summary. In this paper, we propose to summarize library code with tree-adjoining-language (TAL) reachability. Compared with the summary built with context-free-language (CFL) reachability, the summary built with TAL reachability further contains conditional reachability relationships. The conditional reachability relationships can lead to much lighter analysis of the library code during the client code analysis with the TAL-reachability-based library summary. We also performed an experimental comparison of context-sensitive data-dependence analysis with the TAL-reachability-based library summary and context-sensitive data-dependence analysis with the CFL-reachability-based library summary using 15 benchmark subjects. Our experimental results demonstrate that the former has an 8X speed-up over the latter on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Chatterjee:2015:FAA, author = "Krishnendu Chatterjee and Rasmus Ibsen-Jensen and Andreas Pavlogiannis and Prateesh Goyal", title = "Faster Algorithms for Algebraic Path Properties in Recursive State Machines with Constant Treewidth", journal = j-SIGPLAN, volume = "50", number = "1", pages = "97--109", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676979", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Interprocedural analysis is at the heart of numerous applications in programming languages, such as alias analysis, constant propagation, etc. Recursive state machines (RSMs) are standard models for interprocedural analysis. We consider a general framework with RSMs where the transitions are labeled from a semiring, and path properties are algebraic with semiring operations. RSMs with algebraic path properties can model interprocedural dataflow analysis problems, the shortest path problem, the most probable path problem, etc. 
The traditional algorithms for interprocedural analysis focus on path properties where the starting point is fixed as the entry point of a specific method. In this work, we consider possible multiple queries as required in many applications such as in alias analysis. The study of multiple queries allows us to bring in a very important algorithmic distinction between the resource usage of the one-time preprocessing and that of each individual query. The second aspect that we consider is that the control flow graphs for most programs have constant treewidth. Our main contributions are simple and implementable algorithms that support multiple queries for algebraic path properties for RSMs that have constant treewidth. Our theoretical results show that our algorithms have small additional one-time preprocessing, but can answer subsequent queries significantly faster as compared to the current best-known solutions for several important problems, such as interprocedural reachability and shortest path. We provide a prototype implementation for interprocedural reachability and intraprocedural shortest path that gives a significant speed-up on several benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Raychev:2015:PPP, author = "Veselin Raychev and Martin Vechev and Andreas Krause", title = "Predicting Program Properties from {``Big Code''}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "111--124", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677009", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new approach for predicting program properties from massive codebases (aka ``Big Code''). Our approach first learns a probabilistic model from existing data and then uses this model to predict properties of new, unseen programs. The key idea of our work is to transform the input program into a representation which allows us to phrase the problem of inferring program properties as structured prediction in machine learning. This formulation enables us to leverage powerful probabilistic graphical models such as conditional random fields (CRFs) in order to perform joint prediction of program properties. As an example of our approach, we built a scalable prediction engine called JSNice for solving two kinds of problems in the context of JavaScript: predicting (syntactic) names of identifiers and predicting (semantic) type annotations of variables. Experimentally, JSNice predicts correct names for 63\% of name identifiers and its type annotation predictions are correct in 81\% of the cases. In the first week since its release, JSNice was used by more than 30,000 developers and in only a few months has become a popular tool in the JavaScript developer community.
By formulating the problem of inferring program properties as structured prediction and showing how to perform both learning and inference in this context, our work opens up new possibilities for attacking a wide range of difficult problems in the context of ``Big Code'' including invariant generation, decompilation, synthesis and others.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Alur:2015:DDL, author = "Rajeev Alur and Loris D'Antoni and Mukund Raghothaman", title = "{DReX}: a Declarative Language for Efficiently Evaluating Regular String Transformations", journal = j-SIGPLAN, volume = "50", number = "1", pages = "125--137", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676981", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "We present DReX, a declarative language that can express all regular string-to-string transformations, and can still be efficiently evaluated. The class of regular string transformations has a robust theoretical foundation including multiple characterizations, closure properties, and decidable analysis questions, and admits a number of string operations such as insertion, deletion, substring swap, and reversal. Recent research has led to a characterization of regular string transformations using a primitive set of function combinators analogous to the definition of regular languages using regular expressions. While these combinators form the basis for the language DReX proposed in this paper, our main technical focus is on the complexity of evaluating the output of a DReX program on a given input string. It turns out that the natural evaluation algorithm involves dynamic programming, leading to complexity that is cubic in the length of the input string. Our main contribution is identifying a consistency restriction on the use of combinators in DReX programs, and a single-pass evaluation algorithm for consistent programs with time complexity that is linear in the length of the input string and polynomial in the size of the program. We show that the consistency restriction does not limit the expressiveness, and whether a DReX program is consistent can be checked efficiently. 
We report on a prototype implementation, and evaluate it using a representative set of text processing tasks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Veanes:2015:DPS, author = "Margus Veanes and Todd Mytkowicz and David Molnar and Benjamin Livshits", title = "Data-Parallel String-Manipulating Programs", journal = j-SIGPLAN, volume = "50", number = "1", pages = "139--152", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677014", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "String-manipulating programs are an important class of programs with applications in malware detection, graphics, input sanitization for Web security, and large-scale HTML processing. This paper extends prior work on BEK, an expressive domain-specific language for writing string-manipulating programs, with algorithmic insights that make BEK both analyzable and data-parallel. By analyzable we mean that unlike most general purpose programming languages, many algebraic properties of a BEK program are decidable (i.e., one can check whether two programs commute or compute the inverse of a program). By data-parallel we mean that a BEK program can compute on arbitrary subsections of its input in parallel, thus exploiting parallel hardware. This latter requirement is particularly important for programs which operate on large data: without data parallelism, a programmer cannot hide the latency of reading data from various storage media (i.e., reading a terabyte of data from a modern hard drive takes about 3 hours). With a data-parallel approach, the system can split data across multiple disks and thus hide the latency of reading the data. A BEK program is expressive: a programmer can use conditionals, switch statements, and registers --- or local variables --- in order to implement common string-manipulating programs. Unfortunately, this expressivity induces data dependencies, which are an obstacle to parallelism. The key contribution of this paper is an algorithm which automatically removes these data dependencies by mapping a BEK program into an intermediate format consisting of symbolic transducers, which extend classical transducers with symbolic predicates and symbolic assignments. We present a novel algorithm that we call exploration which performs symbolic loop unrolling of these transducers to obtain simplified versions of the original program. We show how these simplified versions can then be lifted to a stateless form, and from there compiled to data-parallel hardware. To evaluate the efficacy of our approach, we demonstrate up to 8x speedups for a number of real-world BEK programs (e.g., HTML encoder and decoder) on data-parallel hardware. To the best of our knowledge, these are the first data-parallel implementations of these programs.
To validate that our approach is correct, we use an automatic testing technique to compare our generated code to the original implementations and find no semantic deviations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Chlipala:2015:UWS, author = "Adam Chlipala", title = "{Ur\slash Web}: a Simple Model for Programming the {Web}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "153--165", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677004", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The World Wide Web has evolved gradually from a document delivery platform to an architecture for distributed programming. This largely unplanned evolution is apparent in the set of interconnected languages and protocols that any Web application must manage. This paper presents Ur/Web, a domain-specific, statically typed functional programming language with a much simpler model for programming modern Web applications. Ur/Web's model is unified, where programs in a single programming language are compiled to other ``Web standards'' languages as needed; supports novel kinds of encapsulation of Web-specific state; and exposes simple concurrency, where programmers can reason about distributed, multithreaded applications via a mix of transactions and cooperative preemption. We give a tutorial introduction to the main features of Ur/Web and discuss the language implementation and the production Web applications that use it.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Rastogi:2015:SEG, author = "Aseem Rastogi and Nikhil Swamy and C{\'e}dric Fournet and Gavin Bierman and Panagiotis Vekris", title = "Safe \& Efficient Gradual Typing for {TypeScript}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "167--180", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676971", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Current proposals for adding gradual typing to JavaScript, such as Closure, TypeScript and Dart, forgo soundness to deal with issues of scale, code reuse, and popular programming patterns. We show how to address these issues in practice while retaining soundness. We design and implement a new gradual type system, prototyped for expediency as a 'Safe' compilation mode for TypeScript. Our compiler achieves soundness by enforcing stricter static checks and embedding residual runtime checks in compiled code. It emits plain JavaScript that runs on stock virtual machines. Our main theorem is a simulation that ensures that the checks introduced by Safe TypeScript (1) catch any dynamic type error, and (2) do not alter the semantics of type-safe TypeScript code. Safe TypeScript is carefully designed to minimize the performance overhead of runtime checks. 
At its core, we rely on two new ideas: differential subtyping, a new form of coercive subtyping that computes the minimum amount of runtime type information that must be added to each object; and an erasure modality, which we use to safely and selectively erase type information. This allows us to scale our design to full-fledged TypeScript, including arrays, maps, classes, inheritance, overloading, and generic types. We validate the usability and performance of Safe TypeScript empirically by type-checking and compiling around 120,000 lines of existing TypeScript source code. Although runtime checks can be expensive, the end-to-end overhead is small for code bases that already have type annotations. For instance, we bootstrap the Safe TypeScript compiler (90,000 lines including the base TypeScript compiler): we measure a 15\% runtime overhead for type safety, and also uncover programming errors as type safety violations. We conclude that, at least during development and testing, subjecting JavaScript/TypeScript programs to safe gradual typing adds significant value to source type annotations at a modest cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Greenberg:2015:SEM, author = "Michael Greenberg", title = "Space-Efficient Manifest Contracts", journal = j-SIGPLAN, volume = "50", number = "1", pages = "181--194", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676967", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The standard algorithm for higher-order contract checking can lead to unbounded space consumption and can destroy tail recursion, altering a program's asymptotic space complexity. While space efficiency for gradual types---contracts mediating untyped and typed code---is well studied, sound space efficiency for manifest contracts---contracts that check stronger properties than simple types, e.g., ``is a natural'' instead of ``is an integer''---remains an open problem. We show how to achieve sound space efficiency for manifest contracts with strong predicate contracts. The essential trick is breaking the contract checking down into coercions: structured, blame-annotated lists of checks. By carefully preventing duplicate coercions from appearing, we can restore space efficiency while keeping the same observable behavior.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Sekiyama:2015:MCD, author = "Taro Sekiyama and Yuki Nishida and Atsushi Igarashi", title = "Manifest Contracts for Datatypes", journal = j-SIGPLAN, volume = "50", number = "1", pages = "195--207", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676996", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study algebraic data types in a manifest contract system, a software contract system where contract information occurs as refinement types. We first compare two simple approaches: refinements on type constructors and refinements on data constructors.
For example, lists of positive integers can be described by {l:int list | for_all (lambda y. y > 0) l} in the former, whereas by a user-defined datatype pos_list with cons of type {x:int | x > 0} X pos_list->pos_list in the latter. The two approaches are complementary: the former makes it easier for a programmer to write types and the latter enables more efficient contract checking. To take the best of both worlds, we propose (1) a syntactic translation from refinements on type constructors to equivalent refinements on data constructors and (2) dynamically checked casts between different but compatible datatypes such as int list and pos_list. We define a manifest contract calculus to formalize the semantics of the casts and prove that the translation is correct.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Vafeiadis:2015:CCO, author = "Viktor Vafeiadis and Thibaut Balabonski and Soham Chakraborty and Robin Morisset and Francesco Zappa Nardelli", title = "Common Compiler Optimisations are Invalid in the {C11} Memory Model and what we can do about it", journal = j-SIGPLAN, volume = "50", number = "1", pages = "209--220", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676995", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We show that the weak memory model introduced by the 2011 C and C++ standards does not permit many common source-to-source program transformations (such as expression linearisation and ``roach motel'' reorderings) that modern compilers perform and that are deemed to be correct. As such it cannot be used to define the semantics of intermediate languages of compilers, as, for instance, LLVM aimed to. We consider a number of possible local fixes, some strengthening and some weakening the model. We evaluate the proposed fixes by determining which program transformations are valid with respect to each of the patched models. We provide formal Coq proofs of their correctness or counterexamples as appropriate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Lange:2015:CMG, author = "Julien Lange and Emilio Tuosto and Nobuko Yoshida", title = "From Communicating Machines to Graphical Choreographies", journal = j-SIGPLAN, volume = "50", number = "1", pages = "221--232", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676964", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graphical choreographies, or global graphs, are general multiparty session specifications featuring expressive constructs such as forking, merging, and joining for representing application-level protocols. Global graphs can be directly translated into modelling notations such as BPMN and UML. This paper presents an algorithm whereby a global graph can be constructed from asynchronous interactions represented by communicating finite-state machines (CFSMs). 
Our results include: a sound and complete characterisation of a subset of safe CFSMs from which global graphs can be constructed; an algorithm to translate CFSMs to global graphs; a time complexity analysis; and an implementation of our theory, as well as an experimental evaluation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Dodds:2015:SCT, author = "Mike Dodds and Andreas Haas and Christoph M. Kirsch", title = "A Scalable, Correct Time-Stamped Stack", journal = j-SIGPLAN, volume = "50", number = "1", pages = "233--246", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676963", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent data-structures, such as stacks, queues, and deques, often implicitly enforce a total order over elements in their underlying memory layout. However, much of this order is unnecessary: linearizability only requires that elements are ordered if the insert methods ran in sequence. We propose a new approach which uses timestamping to avoid unnecessary ordering. Pairs of elements can be left unordered if their associated insert operations ran concurrently, and order imposed as necessary at the eventual removal. We realise our approach in a new non-blocking data-structure, the TS (timestamped) stack. Using the same approach, we can define corresponding queue and deque data-structures. In experiments on x86, the TS stack outperforms and outscales all its competitors --- for example, it outperforms the elimination-backoff stack by a factor of two. In our approach, more concurrency translates into less ordering, giving less-contended removal and thus higher performance and scalability. Despite this, the TS stack is linearizable with respect to stack semantics. The weak internal ordering in the TS stack presents a challenge when establishing linearizability: standard techniques such as linearization points work well when there exists a total internal order. We present a new stack theorem, mechanised in Isabelle, which characterises the orderings sufficient to establish stack semantics. By applying our stack theorem, we show that the TS stack is indeed linearizable.
Our theorem constitutes a new, generic proof technique for concurrent stacks, and it paves the way for future weakly ordered data-structure designs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Jourdan:2015:FVC, author = "Jacques-Henri Jourdan and Vincent Laporte and Sandrine Blazy and Xavier Leroy and David Pichardie", title = "A Formally-Verified {C} Static Analyzer", journal = j-SIGPLAN, volume = "50", number = "1", pages = "247--259", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676966", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper reports on the design and soundness proof, using the Coq proof assistant, of Verasco, a static analyzer based on abstract interpretation for most of the ISO C 1999 language (excluding recursion and dynamic allocation). Verasco establishes the absence of run-time errors in the analyzed programs. It enjoys a modular architecture that supports the extensible combination of multiple abstract domains, both relational and non-relational. Verasco integrates with the CompCert formally-verified C compiler so that not only the soundness of the analysis results is guaranteed with mathematical certitude, but also the fact that these guarantees carry over to the compiled code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Giacobazzi:2015:APA, author = "Roberto Giacobazzi and Francesco Logozzo and Francesco Ranzato", title = "Analyzing Program Analyses", journal = j-SIGPLAN, volume = "50", number = "1", pages = "261--273", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676987", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We want to prove that a static analysis of a given program is complete, namely, no imprecision arises when asking some query on the program behavior in the concrete (ie, for its concrete semantics) or in the abstract (ie, for its abstract interpretation). Completeness proofs are therefore useful to assign confidence to alarms raised by static analyses. We introduce the completeness class of an abstraction as the set of all programs for which the abstraction is complete. Our first result shows that for any nontrivial abstraction, its completeness class is not recursively enumerable. We then introduce a stratified deductive system to prove the completeness of program analyses over an abstract domain A. We prove the soundness of the deductive system. We observe that the only sources of incompleteness are assignments and Boolean tests --- unlikely a common belief in static analysis, joins do not induce incompleteness. The first layer of this proof system is generic, abstraction-agnostic, and it deals with the standard constructs for program composition, that is, sequential composition, branching and guarded iteration. 
The second layer is instead abstraction-specific: the designer of an abstract domain A provides conditions for completeness in A of assignments and Boolean tests which have to be checked by a suitable static analysis or assumed in the completeness proof as hypotheses. We instantiate the second layer of this proof system first with a generic nonrelational abstraction in order to provide a sound rule for the completeness of assignments. Orthogonally, we instantiate it to the numerical abstract domains of Intervals and Octagons, providing necessary and sufficient conditions for the completeness of their Boolean tests and of assignments for Octagons.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Stewart:2015:CC, author = "Gordon Stewart and Lennart Beringer and Santiago Cuellar and Andrew W. Appel", title = "Compositional {CompCert}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "275--287", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676985", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper reports on the development of Compositional CompCert, the first verified separate compiler for C. Specifying and proving separate compilation for C is made challenging by the coincidence of: compiler optimizations, such as register spilling, that introduce compiler-managed (private) memory regions into function stack frames, and C's stack-allocated addressable local variables, which may leak portions of stack frames to other modules when their addresses are passed as arguments to external function calls. The CompCert compiler, as built/proved by Leroy et al. 2006--2014, has proofs of correctness for whole programs, but its simulation relations are too weak to specify or prove separately compiled modules. Our technical contributions that make Compositional CompCert possible include: language-independent linking, a new operational model of multilanguage linking that supports strong semantic contextual equivalences; and structured simulations, a refinement of Beringer et al.'s logical simulation relations that enables expressive module-local invariants on the state communicated between compilation units at runtime.
All the results in the paper have been formalized in Coq and are available for download together with the Compositional CompCert compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Castagna:2015:PFS, author = "Giuseppe Castagna and Kim Nguyen and Zhiwu Xu and Pietro Abate", title = "Polymorphic Functions with Set-Theoretic Types: {Part 2}: Local Type Inference and Type Reconstruction", journal = j-SIGPLAN, volume = "50", number = "1", pages = "289--302", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676991", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This article is the second part of a two articles series about the definition of higher order polymorphic functions in a type system with recursive types and set-theoretic type connectives (unions, intersections, and negations). In the first part, presented in a companion paper, we defined and studied the syntax, semantics, and evaluation of the explicitly-typed version of a calculus, in which type instantiation is driven by explicit instantiation annotations. In this second part we present a local type inference system that allows the programmer to omit explicit instantiation annotations for function applications, and a type reconstruction system that allows the programmer to omit explicit type annotations for function definitions. The work presented in the two articles provides the theoretical foundations and technical machinery needed to design and implement higher-order polymorphic functional languages with union and intersection types and/or for semi-structured data processing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Garcia:2015:PTS, author = "Ronald Garcia and Matteo Cimini", title = "Principal Type Schemes for Gradual Programs", journal = j-SIGPLAN, volume = "50", number = "1", pages = "303--315", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676992", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Gradual typing is a discipline for integrating dynamic checking into a static type system. Since its introduction in functional languages, it has been adapted to a variety of type systems, including object-oriented, security, and substructural. This work studies its application to implicitly typed languages based on type inference. Siek and Vachharajani designed a gradual type inference system and algorithm that infers gradual types but still rejects ill-typed static programs. However, the type system requires local reasoning about type substitutions, an imperative inference algorithm, and a subtle correctness statement. This paper introduces a new approach to gradual type inference, driven by the principle that gradual inference should only produce static types. We present a static implicitly typed language, its gradual counterpart, and a type inference procedure. 
The gradual system types the same programs as Siek and Vachharajani, but has a modular structure amenable to extension. The language admits let-polymorphism, and its dynamics are defined by translation to the Polymorphic Blame Calculus. The principal types produced by our initial type system mask the distinction between static parametric polymorphism and polymorphism that can be attributed to gradual typing. To expose this difference, we distinguish static type parameters from gradual type parameters and reinterpret gradual type consistency accordingly. The resulting extension enables programs to be interpreted using either the polymorphic or monomorphic Blame Calculi.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Lourenco:2015:DIF, author = "Lu{\'\i}sa Louren{\c{c}}o and Lu{\'\i}s Caires", title = "Dependent Information Flow Types", journal = j-SIGPLAN, volume = "50", number = "1", pages = "317--328", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676994", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we develop a novel notion of dependent information flow types. Dependent information flow types fit within the standard framework of dependent type theory, but, unlike usual dependent types, crucially allow the security level of a type, rather than just the structural data type itself, to depend on runtime values. Our dependent function and dependent sum information flow types provide a direct, natural and elegant way to express and enforce fine grained security policies on programs, including programs that manipulate structured data types in which the security level of a structure field may depend on values dynamically stored in other fields, still considered a challenge to security enforcement in software systems such as data-centric web-based applications. We base our development on the very general setting of a minimal lambda-calculus with references and collections. We illustrate its expressiveness, showing how secure operations on relevant scenarios can be modelled and analysed using our dependent information flow type system, which is also shown to be amenable to algorithmic type checking. Our main results include type-safety and non-interference theorems ensuring that well-typed programs do not violate prescribed security policies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Preda:2015:ASA, author = "Mila Dalla Preda and Roberto Giacobazzi and Arun Lakhotia and Isabella Mastroeni", title = "Abstract Symbolic Automata: Mixed syntactic\slash semantic similarity analysis of executables", journal = j-SIGPLAN, volume = "50", number = "1", pages = "329--341", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676986", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a model for mixed syntactic/semantic approximation of programs based on symbolic finite automata (SFA). 
The edges of SFA are labeled by predicates whose semantics specifies the denotations that are allowed by the edge. We introduce the notion of abstract symbolic finite automaton (ASFA) where approximation is made by abstract interpretation of symbolic finite automata, acting both at syntactic (predicate) and semantic (denotation) level. We investigate in the details how the syntactic and semantic abstractions of SFA relate to each other and contribute to the determination of the recognized language. Then we introduce a family of transformations for simplifying ASFA. We apply this model to prove properties of commonly used tools for similarity analysis of binary executables. Following the structure of their control flow graphs, disassembled binary executables are represented as (concrete) SFA, where states are program points and predicates represent the (possibly infinite) I/O semantics of each basic block in a constraint form. Known tools for binary code analysis are viewed as specific choices of symbolic and semantic abstractions in our framework, making symbolic finite automata and their abstract interpretations a unifying model for comparing and reasoning about soundness and completeness of analyses of low-level code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Foster:2015:CDP, author = "Nate Foster and Dexter Kozen and Matthew Milano and Alexandra Silva and Laure Thompson", title = "A Coalgebraic Decision Procedure for {NetKAT}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "343--355", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677011", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "NetKAT is a domain-specific language and logic for specifying and verifying network packet-processing functions. It consists of Kleene algebra with tests (KAT) augmented with primitives for testing and modifying packet headers and encoding network topologies. Previous work developed the design of the language and its standard semantics, proved the soundness and completeness of the logic, defined a PSPACE algorithm for deciding equivalence, and presented several practical applications. This paper develops the coalgebraic theory of NetKAT, including a specialized version of the Brzozowski derivative, and presents a new efficient algorithm for deciding the equational theory using bisimulation. The coalgebraic structure admits an efficient sparse representation that results in a significant reduction in the size of the state space. We discuss the details of our implementation and optimizations that exploit NetKAT's equational axioms and coalgebraic structure to yield significantly improved performance. 
We present results from experiments demonstrating that our tool is competitive with state-of-the-art tools on several benchmarks including all-pairs connectivity, loop-freedom, and translation validation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Pous:2015:SAL, author = "Damien Pous", title = "Symbolic Algorithms for Language Equivalence and {Kleene} Algebra with Tests", journal = j-SIGPLAN, volume = "50", number = "1", pages = "357--368", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677007", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose algorithms for checking language equivalence of finite automata over a large alphabet. We use symbolic automata, where the transition function is compactly represented using (multi-terminal) binary decision diagrams (BDD). The key idea consists in computing a bisimulation by exploring reachable pairs symbolically, so as to avoid redundancies. This idea can be combined with already existing optimisations, and we show in particular a nice integration with the disjoint sets forest data-structure from Hopcroft and Karp's standard algorithm. Then we consider Kleene algebra with tests (KAT), an algebraic theory that can be used for verification in various domains ranging from compiler optimisation to network programming analysis. This theory is decidable by reduction to language equivalence of automata on guarded strings, a particular kind of automata that have exponentially large alphabets. We propose several methods allowing to construct symbolic automata out of KAT expressions, based either on Brzozowski's derivatives or on standard automata constructions. All in all, this results in efficient algorithms for deciding equivalence of KAT expressions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Sjoberg:2015:PC, author = "Vilhelm Sj{\"o}berg and Stephanie Weirich", title = "Programming up to Congruence", journal = j-SIGPLAN, volume = "50", number = "1", pages = "369--382", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676974", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents the design of Zombie, a dependently-typed programming language that uses an adaptation of a congruence closure algorithm for proof and type inference. This algorithm allows the type checker to automatically use equality assumptions from the context when reasoning about equality. Most dependently-typed languages automatically use equalities that follow from beta-reduction during type checking; however, such reasoning is incompatible with congruence closure. In contrast, Zombie does not use automatic beta-reduction because types may contain potentially diverging terms. Therefore Zombie provides a unique opportunity to explore an alternative definition of equivalence in dependently-typed language design. 
Our work includes the specification of the language via a bidirectional type system, which works `up-to-congruence,' and an algorithm for elaborating expressions in this language to an explicitly typed core language. We prove that our elaboration algorithm is complete with respect to the source type system, and always produces well typed terms in the core language. This algorithm has been implemented in the Zombie language, which includes general recursion, irrelevant arguments, heterogeneous equality and datatypes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Tobisawa:2015:MLC, author = "Kazunori Tobisawa", title = "A Meta Lambda Calculus with Cross-Level Computation", journal = j-SIGPLAN, volume = "50", number = "1", pages = "383--393", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676976", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose meta lambda calculus Lambda-* as a basic model of textual substitution via metavariables. The most important feature of the calculus is that every beta-redex can be reduced regardless of whether the beta-redex contains meta-level variables or not. Such a meta lambda calculus has never been achieved before due to difficulty to manage binding structure consistently with alpha-renaming in the presence of meta-level variables. We overcome the difficulty by introducing a new mechanism to deal with substitution and binding structure in a systematic way without the notion of free variables and alpha-renaming. Calculus Lambda-* enables us to investigate cross-level terms that include a certain type of level mismatch. Cross-level terms have been regarded as meaningless terms and left out of consideration thus far. We find that some cross-level terms behave as quotes and `eval' command in programming languages. With these terms, we show a procedural language as an application of the calculus, which sheds new light on the notions of stores and recursion via meta-level variables.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Staton:2015:AEL, author = "Sam Staton", title = "Algebraic Effects, Linearity, and Quantum Programming Languages", journal = j-SIGPLAN, volume = "50", number = "1", pages = "395--406", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676999", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We develop a new framework of algebraic theories with linear parameters, and use it to analyze the equational reasoning principles of quantum computing and quantum programming languages. 
We use the framework as follows: we present a new elementary algebraic theory of quantum computation, built from unitary gates and measurement; we provide a completeness theorem for the elementary algebraic theory by relating it with a model from operator algebra; we extract an equational theory for a quantum programming language from the algebraic theory; we compare quantum computation with other local notions of computation by investigating variations on the algebraic theory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Farzan:2015:PSU, author = "Azadeh Farzan and Zachary Kincaid and Andreas Podelski", title = "Proof Spaces for Unbounded Parallelism", journal = j-SIGPLAN, volume = "50", number = "1", pages = "407--420", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677012", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we present a new approach to automatically verify multi-threaded programs which are executed by an unbounded number of threads running in parallel. The starting point for our work is the problem of how we can leverage existing automated verification technology for sequential programs (abstract interpretation, Craig interpolation, constraint solving, etc.) for multi-threaded programs. Suppose that we are given a correctness proof for a trace of a program (or for some other program fragment). We observe that the proof can always be decomposed into a finite set of Hoare triples, and we ask what can be proved from the finite set of Hoare triples using only simple combinatorial inference rules (without access to a theorem prover and without the possibility to infer genuinely new Hoare triples)? We introduce a proof system where one proves the correctness of a multi-threaded program by showing that for each trace of the program, there exists a correctness proof in the space of proofs that are derivable from a finite set of axioms using simple combinatorial inference rules. This proof system is complete with respect to the classical proof method of establishing an inductive invariant (which uses thread quantification and control predicates). Moreover, it is possible to algorithmically check whether a given set of axioms is sufficient to prove the correctness of a multi-threaded program, using ideas from well-structured transition systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Sangiorgi:2015:ECU, author = "Davide Sangiorgi", title = "Equations, Contractions, and Unique Solutions", journal = j-SIGPLAN, volume = "50", number = "1", pages = "421--432", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676965", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "One of the most studied behavioural equivalences is bisimilarity.
Its success is much due to the associated bisimulation proof method, which can be further enhanced by means of ``up-to bisimulation'' techniques such as ``up-to context''. A different proof method is discussed, based on unique solution of special forms of inequations called contractions, and inspired by Milner's theorem on unique solution of equations. The method is as powerful as the bisimulation proof method and its ``up-to context'' enhancements. The definition of contraction can be transferred onto other behavioural equivalences, possibly contextual and noncoinductive. This enables a coinductive reasoning style on such equivalences, either by applying the method based on unique solution of contractions, or by injecting appropriate contraction preorders into the bisimulation game. The techniques are illustrated on CCS-like languages; an example dealing with higher-order languages is also shown.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Gupta:2015:SRC, author = "Ashutosh Gupta and Thomas A. Henzinger and Arjun Radhakrishna and Roopsha Samanta and Thorsten Tarrach", title = "Succinct Representation of Concurrent Trace Sets", journal = j-SIGPLAN, volume = "50", number = "1", pages = "433--444", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677008", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a method and a tool for generating succinct representations of sets of concurrent traces. We focus on trace sets that contain all correct or all incorrect permutations of events from a given trace. We represent trace sets as HB-Formulas that are Boolean combinations of happens-before constraints between events. To generate a representation of incorrect interleavings, our method iteratively explores interleavings that violate the specification and gathers generalizations of the discovered interleavings into an HB-Formula; its complement yields a representation of correct interleavings. We claim that our trace set representations can drive diverse verification, fault localization, repair, and synthesis techniques for concurrent programs. We demonstrate this by using our tool in three case studies involving synchronization synthesis, bug summarization, and abstraction refinement based verification. In each case study, our initial experimental results have been promising. In the first case study, we present an algorithm for inferring missing synchronization from an HB-Formula representing correct interleavings of a given trace. The algorithm applies rules to rewrite specific patterns in the HB-Formula into locks, barriers, and wait-notify constructs. In the second case study, we use an HB-Formula representing incorrect interleavings for bug summarization. While the HB-Formula itself is a concise counterexample summary, we present additional inference rules to help identify specific concurrency bugs such as data races, define-use order violations, and two-stage access bugs. In the final case study, we present a novel predicate learning procedure that uses HB-Formulas representing abstract counterexamples to accelerate counterexample-guided abstraction refinement (CEGAR). 
In each iteration of the CEGAR loop, the procedure refines the abstraction to eliminate multiple spurious abstract counterexamples drawn from the HB-Formula.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Bogdanas:2015:KJC, author = "Denis Bogdanas and Grigore Rosu", title = "{K-Java}: a Complete Semantics of {Java}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "445--456", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676982", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents K-Java, a complete executable formal semantics of Java 1.4. K-Java was extensively tested with a test suite developed alongside the project, following the Test Driven Development methodology. In order to maintain clarity while handling the great size of Java, the semantics was split into two separate definitions --- a static semantics and a dynamic semantics. The output of the static semantics is a preprocessed Java program, which is passed as input to the dynamic semantics for execution. The preprocessed program is a valid Java program, which uses a subset of the features of Java. The semantics is applied to model-check multi-threaded programs. Both the test suite and the static semantics are generic and ready to be used in other Java-related projects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Adams:2015:TEH, author = "Michael D. Adams", title = "Towards the Essence of Hygiene", journal = j-SIGPLAN, volume = "50", number = "1", pages = "457--469", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677013", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hygiene is an essential aspect of Scheme's macro system that prevents unintended variable capture. However, previous work on hygiene has focused on algorithmic implementation rather than precise, mathematical definition of what constitutes hygiene. This is in stark contrast with lexical scope, alpha-equivalence and capture-avoiding substitution, which also deal with preventing unintended variable capture but have widely applicable and well-understood mathematical definitions. This paper presents such a precise, mathematical definition of hygiene. It reviews various kinds of hygiene violation and presents examples of how they occur. From these examples, we develop a practical algorithm for hygienic macro expansion. We then present algorithm-independent, mathematical criteria for whether a macro expansion algorithm is hygienic. 
This characterization corresponds closely to existing hygiene algorithms and sheds light on aspects of hygiene that are usually overlooked in informal definitions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Brown:2015:SRG, author = "Matt Brown and Jens Palsberg", title = "Self-Representation in {Girard}'s {System U}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "471--484", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676988", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In 1991, Pfenning and Lee studied whether System F could support a typed self-interpreter. They concluded that typed self-representation for System F ``seems to be impossible'', but were able to represent System F in F$_{ \omega }$. Further, they found that the representation of F$_{ \omega }$ requires kind polymorphism, which is outside F$_{ \omega }$. In 2009, Rendel, Ostermann and Hofer conjectured that the representation of kind-polymorphic terms would require another, higher form of polymorphism. Is this a case of infinite regress? We show that it is not and present a typed self-representation for Girard's System U, the first for a \lambda -calculus with decidable type checking. System U extends System F$_{ \omega }$ with kind polymorphic terms and types. We show that kind polymorphic types (i.e. types that depend on kinds) are sufficient to ``tie the knot'' --- they enable representations of kind polymorphic terms without introducing another form of polymorphism. Our self-representation supports operations that iterate over a term, each of which can be applied to a representation of itself. We present three typed self-applicable operations: a self-interpreter that recovers a term from its representation, a predicate that tests the intensional structure of a term, and a typed continuation-passing-style (CPS) transformation --- the first typed self-applicable CPS transformation. Our techniques could have applications from verifiably type-preserving metaprograms, to growable typed languages, to more efficient self-interpreters.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Lee:2015:CEE, author = "Peter Lee", title = "Coding by Everyone, Every Day", journal = j-SIGPLAN, volume = "50", number = "1", pages = "485--485", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2682622", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In recent years, advances in machine learning and related fields have led to significant advances in a range of user-interface technologies, including audio processing, speech recognition, and natural language processing. These advances in turn have enabled speech-based digital assistants and speech-to-speech translation systems to become practical to deploy on a large scale. In essence, machines are becoming capable of hearing what we are saying. But will they understand what we want them to do when we talk to them? 
What are the prospects for getting useful work done --- in essence, by synthesizing programs --- through the act of having a conversation with a computer? In this lecture, I will speculate on the central role that programming-language design and program synthesis may have in this possible --- and I will argue, likely --- future of computing, one in which every user writes programs, every day, by conversing with a computing system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Buneman:2015:DPT, author = "Peter Buneman", title = "Databases and Programming: Two Subjects Divided by a Common Language?", journal = j-SIGPLAN, volume = "50", number = "1", pages = "487--487", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2682620", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The 1990s saw a hugely productive interaction between database and programming language research. Ideas about type systems from programming languages played a central role in generalizing and adapting relational database systems to new data models. At the same time databases provided some of the best concrete examples of the application of concurrency theory and of the benefits of high-level optimization in functional programming languages. One of the driving ambitions behind this research was the idea that database access should be properly embedded in programming languages: one should not have to be bilingual in order to use a database from a programming language; and that goal has to some extent been realized. In the past fifteen years, new data models, both for data storage and for data exchange have appeared with depressing regularity and with each such model, the inevitable query language. Does programming language research have anything to contribute to these new languages? Should we take the time to worry about embedding these models in conventional languages? Over the same period, some interesting new connections between databases and programming languages have emerged, notably in the areas of scientific databases, annotation and provenance. Will this provide new opportunities for cross-fertilization?", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Fioriti:2015:PTS, author = "Luis Mar{\'\i}a Ferrer Fioriti and Holger Hermanns", title = "Probabilistic Termination: Soundness, Completeness, and Compositionality", journal = j-SIGPLAN, volume = "50", number = "1", pages = "489--501", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677001", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a framework to prove almost sure termination for probabilistic programs with real valued variables. It is based on ranking supermartingales, a notion analogous to ranking functions on non-probabilistic programs. The framework is proven sound and complete for a meaningful class of programs involving randomization and bounded nondeterminism.
We complement this foundational insight by a practical proof methodology, based on sound conditions that enable compositional reasoning and are amenable to a direct implementation using modern theorem provers. This is integrated in a small dependent type system, to overcome the problem that lexicographic ranking functions fail when combined with randomization. Among others, this compositional methodology enables the verification of probabilistic programs outside the complete class that admits ranking supermartingales.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{He:2015:LWA, author = "Fei He and Xiaowei Gao and Bow-Yaw Wang and Lijun Zhang", title = "Leveraging Weighted Automata in Compositional Reasoning about Concurrent Probabilistic Systems", journal = j-SIGPLAN, volume = "50", number = "1", pages = "503--514", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676998", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose the first sound and complete learning-based compositional verification technique for probabilistic safety properties on concurrent systems where each component is a Markov decision process. Different from previous works, weighted assumptions are introduced to attain completeness of our framework. Since weighted assumptions can be implicitly represented by multi-terminal binary decision diagrams (MTBDD's), we give an L*-based learning algorithm for MTBDD's to infer weighted assumptions. Experimental results suggest promising outlooks for our compositional technique.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Bonchi:2015:FAS, author = "Filippo Bonchi and Pawel Sobocinski and Fabio Zanasi", title = "Full Abstraction for Signal Flow Graphs", journal = j-SIGPLAN, volume = "50", number = "1", pages = "515--526", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676993", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Network theory uses the string diagrammatic language of monoidal categories to study graphical structures formally, eschewing specialised translations into intermediate formalisms. Recently, there has been a concerted research focus on developing a network theoretic approach to signal flow graphs, which are classical structures in control theory, signal processing and a cornerstone in the study of feedback. In this approach, signal flow graphs are given a relational denotational semantics in terms of formal power series. Thus far, the operational behaviour of such signal flow graphs has only been discussed at an intuitive level. In this paper we equip them with a structural operational semantics. As is typically the case, the purely operational picture is too concrete --- two graphs that are denotationally equal may exhibit different operational behaviour.
We classify the ways in which this can occur and show that any graph can be realised --- rewritten, using the graphical theory, into an executable form where the operational behaviour and the denotation coincide.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Hinze:2015:CHM, author = "Ralf Hinze and Nicolas Wu and Jeremy Gibbons", title = "Conjugate Hylomorphisms --- Or: The Mother of All Structured Recursion Schemes", journal = j-SIGPLAN, volume = "50", number = "1", pages = "527--538", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676989", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The past decades have witnessed an extensive study of structured recursion schemes. A general scheme is the hylomorphism, which captures the essence of divide-and-conquer: a problem is broken into sub-problems by a coalgebra; sub-problems are solved recursively; the sub-solutions are combined by an algebra to form a solution. In this paper we develop a simple toolbox for assembling recursive coalgebras, which by definition ensure that their hylo equations have unique solutions, whatever the algebra. Our main tool is the conjugate rule, a generic rule parametrized by an adjunction and a conjugate pair of natural transformations. We show that many basic adjunctions induce useful recursion schemes. In fact, almost every structured recursion scheme seems to arise as an instance of the conjugate rule. Further, we adapt our toolbox to the more expressive setting of parametrically recursive coalgebras, where the original input is also passed to the algebra. The formal development is complemented by a series of worked-out examples in Haskell.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Chatterjee:2015:QIA, author = "Krishnendu Chatterjee and Andreas Pavlogiannis and Yaron Velner", title = "Quantitative Interprocedural Analysis", journal = j-SIGPLAN, volume = "50", number = "1", pages = "539--551", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676968", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We consider the quantitative analysis problem for interprocedural control-flow graphs (ICFGs). The input consists of an ICFG, a positive weight function that assigns every transition a positive integer-valued number, and a labelling of the transitions (events) as good, bad, and neutral events. The weight function assigns to each transition a numerical value that represents a measure of how good or bad an event is. The quantitative analysis problem asks whether there is a run of the ICFG where the ratio of the sum of the numerical weights of good events versus the sum of weights of bad events in the long-run is at least a given threshold (or equivalently, to compute the maximal ratio among all valid paths in the ICFG).
The quantitative analysis problem for ICFGs can be solved in polynomial time, and we present an efficient and practical algorithm for the problem. We show that several problems relevant for static program analysis, such as estimating the worst-case execution time of a program or the average energy consumption of a mobile application, can be modeled in our framework. We have implemented our algorithm as a tool in the Java Soot framework. We demonstrate the effectiveness of our approach with two case studies. First, we show that our framework provides a sound approach (no false positives) for the analysis of inefficiently-used containers. Second, we show that our approach can also be used for static profiling of programs which reasons about methods that are frequently invoked. Our experimental results show that our tool scales to relatively large benchmarks, and discovers relevant and useful information that can be used to optimize performance of the programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Bastani:2015:SIU, author = "Osbert Bastani and Saswat Anand and Alex Aiken", title = "Specification Inference Using Context-Free Language Reachability", journal = j-SIGPLAN, volume = "50", number = "1", pages = "553--566", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676977", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a framework for computing context-free language reachability properties when parts of the program are missing. Our framework infers candidate specifications for missing program pieces that are needed for verifying a property of interest, and presents these specifications to a human auditor for validation. We have implemented this framework for a taint analysis of Android apps that relies on specifications for Android library methods. In an extensive experimental study on 179 apps, our tool performs verification with only a small number of queries to a human auditor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Elango:2015:CDA, author = "Venmugil Elango and Fabrice Rastello and Louis-No{\"e}l Pouchet and J. Ramanujam and P. Sadayappan", title = "On Characterizing the Data Access Complexity of Programs", journal = j-SIGPLAN, volume = "50", number = "1", pages = "567--580", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677010", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Technology trends will cause data movement to account for the majority of energy expenditure and execution time on emerging computers. Therefore, computational complexity will no longer be a sufficient metric for comparing algorithms, and a fundamental characterization of data access complexity will be increasingly important. The problem of developing lower bounds for data access complexity has been modeled using the formalism of Hong and Kung's red/blue pebble game for computational directed acyclic graphs (CDAGs). 
However, previously developed approaches to lower bounds analysis for the red/blue pebble game are very limited in effectiveness when applied to CDAGs of real programs, with computations comprised of multiple sub-computations with differing DAG structure. We address this problem by developing an approach for effectively composing lower bounds based on graph decomposition. We also develop a static analysis algorithm to derive the asymptotic data-access lower bounds of programs, as a function of the problem size and cache size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Agten:2015:SMV, author = "Pieter Agten and Bart Jacobs and Frank Piessens", title = "Sound Modular Verification of {C} Code Executing in an Unverified Context", journal = j-SIGPLAN, volume = "50", number = "1", pages = "581--594", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676972", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the past decade, great progress has been made in the static modular verification of C code by means of separation logic-based program logics. However, the runtime guarantees offered by such verification are relatively limited when the verified modules are part of a whole program that also contains unverified modules. In particular, a memory safety error in an unverified module can corrupt the runtime state, leading to assertion failures or invalid memory accesses in the verified modules. This paper develops runtime checks to be inserted at the boundary between the verified and the unverified part of a program, to guarantee that no assertion failures or invalid memory accesses can occur at runtime in any verified module. One of the key challenges is enforcing the separation logic frame rule, which we achieve by checking the integrity of the footprint of the verified part of the program on each control flow transition from the unverified to the verified part. This in turn requires the presence of some support for module-private memory at runtime. We formalize our approach and prove soundness. We implement the necessary runtime checks by means of a program transformation that translates C code with separation logic annotations into plain C, and that relies on a protected module architecture for providing module-private memory and restricted module entry points. 
Benchmarks show the performance impact of this transformation depends on the choice of boundary between the verified and unverified parts of the program, but is below 4\% for real-world applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Gu:2015:DSC, author = "Ronghui Gu and J{\'e}r{\'e}mie Koenig and Tahina Ramananandro and Zhong Shao and Xiongnan (Newman) Wu and Shu-Chun Weng and Haozhong Zhang and Yu Guo", title = "Deep Specifications and Certified Abstraction Layers", journal = j-SIGPLAN, volume = "50", number = "1", pages = "595--608", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676975", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern computer systems consist of a multitude of abstraction layers (e.g., OS kernels, hypervisors, device drivers, network protocols), each of which defines an interface that hides the implementation details of a particular set of functionality. Client programs built on top of each layer can be understood solely based on the interface, independent of the layer implementation. Despite their obvious importance, abstraction layers have mostly been treated as a system concept; they have almost never been formally specified or verified. This makes it difficult to establish strong correctness properties, and to scale program verification across multiple layers. In this paper, we present a novel language-based account of abstraction layers and show that they correspond to a strong form of abstraction over a particularly rich class of specifications which we call deep specifications. Just as data abstraction in typed functional languages leads to the important representation independence property, abstraction over deep specification is characterized by an important implementation independence property: any two implementations of the same deep specification must have contextually equivalent behaviors. We present a new layer calculus showing how to formally specify, program, verify, and compose abstraction layers. We show how to instantiate the layer calculus in realistic programming languages such as C and assembly, and how to adapt the CompCert verified compiler to compile certified C layers such that they can be linked with assembly layers. 
Using these new languages and tools, we have successfully developed multiple certified OS kernels in the Coq proof assistant, the most realistic of which consists of 37 abstraction layers, took less than one person year to develop, and can boot a version of Linux as a guest.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Chlipala:2015:NIM, author = "Adam Chlipala", title = "From Network Interface to Multithreaded {Web} Applications: a Case Study in Modular Program Verification", journal = j-SIGPLAN, volume = "50", number = "1", pages = "609--622", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677003", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many verifications of realistic software systems are monolithic, in the sense that they define single global invariants over complete system state. More modular proof techniques promise to support reuse of component proofs and even reduce the effort required to verify one concrete system, just as modularity simplifies standard software development. This paper reports on one case study applying modular proof techniques in the Coq proof assistant. To our knowledge, it is the first modular verification certifying a system that combines infrastructure with an application of interest to end users. We assume a nonblocking API for managing TCP networking streams, and on top of that we work our way up to certifying multithreaded, database-backed Web applications. Key verified components include a cooperative threading library and an implementation of a domain-specific language for XML processing. We have deployed our case-study system on mobile robots, where it interfaces with off-the-shelf components for sensing, actuation, and control.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Crary:2015:CRM, author = "Karl Crary and Michael J. Sullivan", title = "A Calculus for Relaxed Memory", journal = j-SIGPLAN, volume = "50", number = "1", pages = "623--636", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676984", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a new approach to programming multi-core, relaxed-memory architectures in imperative, portable programming languages. Our memory model is based on explicit, programmer-specified requirements for order of execution and the visibility of writes. The compiler then realizes those requirements in the most efficient manner it can. This is in contrast to existing memory models, which---if they allow programmer control over synchronization at all---are based on inferring the execution and visibility consequences of synchronization operations or annotations in the code. We formalize our memory model in a core calculus called RMC\@. Outside of the programmer's specified requirements, RMC is designed to be strictly more relaxed than existing architectures. 
It employs an aggressively nondeterministic semantics for expressions, in which actions can be executed in nearly any order, and a store semantics that generalizes Sarkar et al.'s and Alglave et al.'s models of the Power architecture. We establish several results for RMC, including sequential consistency for two programming disciplines, and an appropriate notion of type safety. All our results are formalized in Coq.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Jung:2015:IMI, author = "Ralf Jung and David Swasey and Filip Sieczkowski and Kasper Svendsen and Aaron Turon and Lars Birkedal and Derek Dreyer", title = "{Iris}: Monoids and Invariants as an Orthogonal Basis for Concurrent Reasoning", journal = j-SIGPLAN, volume = "50", number = "1", pages = "637--650", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676980", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Iris, a concurrent separation logic with a simple premise: monoids and invariants are all you need. Partial commutative monoids enable us to express---and invariants enable us to enforce---user-defined *protocols* on shared state, which are at the conceptual core of most recent program logics for concurrency. Furthermore, through a novel extension of the concept of a *view shift*, Iris supports the encoding of *logically atomic specifications*, i.e., Hoare-style specs that permit the client of an operation to treat the operation essentially as if it were atomic, even if it is not.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Bouajjani:2015:TRC, author = "Ahmed Bouajjani and Michael Emmi and Constantin Enea and Jad Hamza", title = "Tractable Refinement Checking for Concurrent Objects", journal = j-SIGPLAN, volume = "50", number = "1", pages = "651--662", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677002", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Efficient implementations of concurrent objects such as semaphores, locks, and atomic collections are essential to modern computing. Yet programming such objects is error prone: in minimizing the synchronization overhead between concurrent object invocations, one risks the conformance to reference implementations --- or in formal terms, one risks violating observational refinement. Testing this refinement even within a single execution is intractable, limiting existing approaches to executions with very few object invocations. We develop a polynomial-time (per execution) approximation to refinement checking. The approximation is parameterized by an accuracy k \in N representing the degree to which refinement violations are visible. In principle, more violations are detectable as k increases, and in the limit, all are detectable. 
Our insight for this approximation arises from foundational properties on the partial orders characterizing the happens-before relations between object invocations: they are interval orders, with a well defined measure of complexity, i.e., their length. Approximating the happens-before relation with a possibly-weaker interval order of bounded length can be efficiently implemented by maintaining a bounded number of integer counters. In practice, we find that refinement violations can be detected with very small values of k, and that our approach scales far beyond existing refinement-checking approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Padon:2015:DSP, author = "Oded Padon and Neil Immerman and Aleksandr Karbyshev and Ori Lahav and Mooly Sagiv and Sharon Shoham", title = "Decentralizing {SDN} Policies", journal = j-SIGPLAN, volume = "50", number = "1", pages = "663--676", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676990", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software-defined networking (SDN) is a new paradigm for operating and managing computer networks. SDN enables logically-centralized control over network devices through a ``controller'' --- software that operates independently of the network hardware. Network operators can run both in-house and third-party SDN programs on top of the controller, e.g., to specify routing and access control policies. In practice, having the controller handle events limits the network scalability. Therefore, the feasibility of SDN depends on the ability to efficiently decentralize network event-handling by installing forwarding rules on the switches. However, installing a rule too early or too late may lead to incorrect behavior, e.g., (1) packets may be forwarded to the wrong destination or incorrectly dropped; (2) packets handled by the switch may hide vital information from the controller, leading to incorrect forwarding behavior. The second issue is subtle and sometimes missed even by experienced programmers. The contributions of this paper are two fold. First, we formalize the correctness and optimality requirements for decentralizing network policies. Second, we identify a useful class of network policies which permits automatic synthesis of a controller which performs optimal forwarding rule installation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Cochran:2015:PBP, author = "Robert A. Cochran and Loris D'Antoni and Benjamin Livshits and David Molnar and Margus Veanes", title = "Program Boosting: Program Synthesis via Crowd-Sourcing", journal = j-SIGPLAN, volume = "50", number = "1", pages = "677--688", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676973", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "In this paper, we investigate an approach to program synthesis that is based on crowd-sourcing. 
With the help of crowd-sourcing, we aim to capture the ``wisdom of the crowds'' to find good if not perfect solutions to inherently tricky programming tasks, which elude even expert developers and lack an easy-to-formalize specification. We propose an approach we call program boosting, which involves crowd-sourcing imperfect solutions to a difficult programming problem from developers and then blending these programs together in a way that improves their correctness. We implement this approach in a system called CROWDBOOST and show in our experiments that interesting and highly non-trivial tasks such as writing regular expressions for URLs or email addresses can be effectively crowd-sourced. We demonstrate that carefully blending the crowd-sourced results together consistently produces a boost, yielding results that are better than any of the starting programs. Our experiments on 465 program pairs show consistent boosts in accuracy and demonstrate that program boosting can be performed at a relatively modest monetary cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Delaware:2015:FDS, author = "Benjamin Delaware and Cl{\'e}ment Pit-Claudel and Jason Gross and Adam Chlipala", title = "{Fiat}: Deductive Synthesis of Abstract Data Types in a Proof Assistant", journal = j-SIGPLAN, volume = "50", number = "1", pages = "689--700", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677006", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Fiat, a library for the Coq proof assistant supporting refinement of declarative specifications into efficient functional programs with a high degree of automation. Each refinement process leaves a proof trail, checkable by the normal Coq kernel, justifying its soundness. We focus on the synthesis of abstract data types that package methods with private data. We demonstrate the utility of our framework by applying it to the synthesis of query structures --- abstract data types with SQL-like query and insert operations. Fiat includes a library for writing specifications of query structures in SQL-inspired notation, expressing operations over relations (tables) in terms of mathematical sets. This library includes a suite of tactics for automating the refinement of specifications into efficient, correct-by-construction OCaml code. Using these tactics, a programmer can generate such an implementation completely automatically by only specifying the equivalent of SQL indexes, data structures capturing useful views of the abstract data. 
Throughout we speculate on the new programming modularity possibilities enabled by an automated refinement system with proved-correct rules.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '15 conference proceedings.", } @Article{Hanenberg:2015:WDW, author = "Stefan Hanenberg", title = "Why do we know so little about programming languages, and what would have happened if we had known more?", journal = j-SIGPLAN, volume = "50", number = "2", pages = "1--1", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661102", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming language research in the last decades was mainly driven by mathematical methods (such as formal semantics, correctness proofs, type soundness proofs, etc.) or run-time arguments based on benchmark tests. This happened despite the frequent discussion over programming language usability. We have now been through decade after decade of one language after another dominating the field, forcing companies to switch languages and migrate libraries. Now that Javascript seems to be the next language to dominate, people start to ask old questions anew. The first goal of this talk is to discuss why the application of empirical methods is (still) relatively rare in PL research, and to discuss what could be done in empirical methods to make them a substantial part of PL research. The second goal is to speculate about the possible effects that concrete empirical knowledge could have had on the programming language community. For example, what would have happened to programming languages if current knowledge would have been available 30 years ago? What if knowledge about programming languages from the year 2050 would be available today?", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Neto:2015:SOS, author = "Lourival Vieira Neto and Roberto Ierusalimschy and Ana L{\'u}cia de Moura and Marc Balmer", title = "Scriptable operating systems with {Lua}", journal = j-SIGPLAN, volume = "50", number = "2", pages = "2--10", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661096", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Extensible operating system is a design based on the idea that operating systems can be adapted to meet user requirements by allowing user extensions. In a different scenario, that of application development, there is a paradigm that supports that complex systems should allow users to write scripts to tailor an application to their needs. In this paper we propose the concept of scriptable operating system, which applies scripting development paradigm to the concept of extensible operating systems. Scriptable operating systems support that operating systems can adequately provide extensibility by allowing users to script their kernel. 
We also present an implementation of a kernel-scripting environment that allows users to dynamically extend Linux and NetBSD operating systems using the scripting language Lua. To evaluate this environment, we extended both OS kernels to allow users to script CPU frequency scaling and network packet filtering using Lua.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Johnson:2015:AAC, author = "James Ian Johnson and David {Van Horn}", title = "Abstracting abstract control", journal = j-SIGPLAN, volume = "50", number = "2", pages = "11--22", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661098", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The strength of a dynamic language is also its weakness: run-time flexibility comes at the cost of compile-time predictability. Many of the hallmarks of dynamic languages such as closures, continuations, various forms of reflection, and a lack of static types make many programmers rejoice, while compiler writers, tool developers, and verification engineers lament. The dynamism of these features simply confounds statically reasoning about programs that use them. Consequently, static analyses for dynamic languages are few, far between, and seldom sound. The ``abstracting abstract machines'' (AAM) approach to constructing static analyses has recently been proposed as a method to ameliorate the difficulty of designing analyses for such language features. The approach, so called because it derives a function for the sound and computable approximation of program behavior starting from the abstract machine semantics of a language, provides a viable approach to dynamic language analysis since all that is required is a machine description of the interpreter. The AAM recipe as originally described produces finite state abstractions: the behavior of a program is approximated as a finite state machine. Such a model is inherently imprecise when it comes to reasoning about the control stack of the interpreter: a finite state machine cannot faithfully represent a stack. Recent advances have shown that higher-order programs can be approximated with pushdown systems. However, such models, founded in automata theory, either break down or require significant engineering in the face of dynamic language features that inspect or modify the control stack. In this paper, we tackle the problem of bringing pushdown flow analysis to the domain of dynamic language features. We revise the abstracting abstract machines technique to target the stronger computational model of pushdown systems. In place of automata theory, we use only abstract machines and memoization. As case studies, we show the technique applies to a language with closures, garbage collection, stack-inspection, and first-class composable continuations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Strickland:2015:CDS, author = "T. Stephen Strickland and Brianna M. Ren and Jeffrey S.
Foster", title = "Contracts for domain-specific languages in {Ruby}", journal = j-SIGPLAN, volume = "50", number = "2", pages = "23--34", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661092", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper concerns object-oriented embedded DSLs, which are popular in the Ruby community but have received little attention in the research literature. Ruby DSLs implement language keywords as implicit method calls to self; language structure is enforced by adjusting which object is bound to self in different scopes. While Ruby DSLs are powerful and elegant, they suffer from a lack of specification. In this paper, we introduce contracts for Ruby DSLs, which allow us to attribute blame appropriately when there are inconsistencies between an implementation and client. We formalize Ruby DSL contract checking in DSL, a core calculus that uses premethods with instance evaluation to enforce contracts. We then describe RDL, an implementation of Ruby DSL contracts. Finally, we present two tools that automatically infer RDL contracts: TypeInfer infers simple, type-like contracts based on observed method calls, and DSLInfer infers DSL keyword scopes and nesting by generating and testing candidate DSL usages based on initial examples. The type contracts generated by TypeInfer work well enough, though they are limited in precision by the small number of tests, while DSLInfer finds almost all DSL structure. Our goal is to help users understand a DSL from example programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Disney:2015:SYJ, author = "Tim Disney and Nathan Faubion and David Herman and Cormac Flanagan", title = "Sweeten your {JavaScript}: hygienic macros for {ES5}", journal = j-SIGPLAN, volume = "50", number = "2", pages = "35--44", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661097", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Lisp and Scheme have demonstrated the power of macros to enable programmers to evolve and craft languages. In languages with more complex syntax, macros have had less success. In part, this has been due to the difficulty in building expressive hygienic macro systems for such languages. JavaScript in particular presents unique challenges for macro systems due to ambiguities in the lexing stage that force the JavaScript lexer and parser to be intertwined. In this paper we present a novel solution to the lexing ambiguity of JavaScript that enables us to cleanly separate the JavaScript lexer and parser by recording enough history during lexing to resolve ambiguities. We give an algorithm for this solution along with a proof that it does in fact correctly resolve ambiguities in the language. Though the algorithm and proof we present is specific to JavaScript, the general technique can be applied to other languages with ambiguous grammars. 
With lexer and parser separated, we then implement an expressive hygienic macro system for JavaScript called sweet.js.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Vitousek:2015:DEG, author = "Michael M. Vitousek and Andrew M. Kent and Jeremy G. Siek and Jim Baker", title = "Design and evaluation of gradual typing for {Python}", journal = j-SIGPLAN, volume = "50", number = "2", pages = "45--56", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661101", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Combining static and dynamic typing within the same language offers clear benefits to programmers. It provides dynamic typing in situations that require rapid prototyping, heterogeneous data structures, and reflection, while supporting static typing when safety, modularity, and efficiency are primary concerns. Siek and Taha (2006) introduced an approach to combining static and dynamic typing in a fine-grained manner through the notion of type consistency in the static semantics and run-time casts in the dynamic semantics. However, many open questions remain regarding the semantics of gradually typed languages. In this paper we present Reticulated Python, a system for experimenting with gradual-typed dialects of Python. The dialects are syntactically identical to Python 3 but give static and dynamic semantics to the type annotations already present in Python 3. Reticulated Python consists of a typechecker and a source-to-source translator from Reticulated Python to Python 3. Using Reticulated Python, we evaluate a gradual type system and three approaches to the dynamic semantics of mutable objects: the traditional semantics based on Siek and Taha (2007) and Herman et al. (2007) and two new designs. We evaluate these designs in the context of several third-party Python programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Freudenberg:2015:SMP, author = "Bert Freudenberg and Dan H. H. Ingalls and Tim Felgentreff and Tobias Pape and Robert Hirschfeld", title = "{SqueakJS}: a modern and practical smalltalk that runs in any browser", journal = j-SIGPLAN, volume = "50", number = "2", pages = "57--66", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661100", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "We report our experience in implementing SqueakJS, a bit-compatible implementation of Squeak/Smalltalk written in pure JavaScript. SqueakJS runs entirely in the Web browser with a virtual filesystem that can be directed to a server or client-side storage. Our implementation is notable for simplicity and performance gained through adaptation to the host object memory and deployment leverage gained through the Lively Web development environment. 
We present several novel techniques as well as performance measurements for the resulting virtual machine. Much of this experience is potentially relevant to preserving other dynamic language systems and making them available in a browser-based environment.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Aigner:2015:AJE, author = "Martin Aigner and Thomas H{\"u}tter and Christoph M. Kirsch and Alexander Miller and Hannes Payer and Mario Preishuber", title = "{ACDC-JS}: explorative benchmarking of {JavaScript} memory management", journal = j-SIGPLAN, volume = "50", number = "2", pages = "67--78", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661089", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "We present ACDC-JS, an open-source JavaScript memory management benchmarking tool. ACDC-JS incorporates a heap model based on real web applications and may be configured to expose virtually any relevant performance characteristics of JavaScript memory management systems. ACDC-JS is based on ACDC, a benchmarking tool for C/C++ that models periodic allocation and deallocation behavior (AC) as well as persistent memory (DC). We identify important characteristics of JavaScript mutator behavior and propose a configurable heap model based on typical distributions of these characteristics as the foundation for ACDC-JS. We describe heap analyses of 13 real web applications, extending existing work on JavaScript behavior analysis. Our experimental results show that ACDC-JS enables performance benchmarking and debugging of state-of-the-art JavaScript virtual machines such as V8 and SpiderMonkey by exposing key aspects of their memory management performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Kotthaus:2015:DPS, author = "Helena Kotthaus and Ingo Korb and Michael Engel and Peter Marwedel", title = "Dynamic page sharing optimization for the {R} language", journal = j-SIGPLAN, volume = "50", number = "2", pages = "79--90", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661094", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic languages such as R are increasingly used to process large data sets. Here, the R interpreter induces a large memory overhead due to wasteful memory allocation policies. If an application's working set exceeds the available physical memory, the OS starts to swap, resulting in slowdowns of several orders of magnitude. Thus, memory optimizations for R will be beneficial to many applications. Existing R optimizations are mostly based on dynamic compilation or native libraries. Both methods are futile when the OS starts to page out memory. So far, only a few data-type- or application-specific memory optimizations for R exist.
To remedy this situation, we present a low-overhead page sharing approach for R that significantly reduces the interpreter's memory overhead. Concentrating on the most rewarding optimizations avoids the high runtime overhead of existing generic approaches for memory deduplication or compression. In addition, by applying knowledge of interpreter data structures and memory allocation patterns, our approach is not constrained to specific R applications and is transparent to the R interpreter. Our page sharing optimization enables us to reduce the memory consumption by up to 53.5\% with an average of 18.0\% for a set of real-world R benchmarks with a runtime overhead of only 5.3\% on average. In cases where page I/O can be avoided, significant speedups are achieved.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Khan:2015:UJW, author = "Faiz Khan and Vincent Foley-Bourgon and Sujay Kathrotia and Erick Lavoie and Laurie Hendren", title = "Using {JavaScript} and {WebCL} for numerical computations: a comparative study of native and web technologies", journal = j-SIGPLAN, volume = "50", number = "2", pages = "91--102", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661090", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "From its modest beginnings as a tool to validate forms, JavaScript is now an industrial-strength language used to power online applications such as spreadsheets, IDEs, image editors and even 3D games. Since all modern web browsers support JavaScript, it provides a medium that is both easy to distribute for developers and easy to access for users. This paper provides empirical data to answer the question: Is JavaScript fast enough for numerical computations? By measuring and comparing the runtime performance of benchmarks representative of a wide variety of scientific applications, we show that sequential JavaScript is within a factor of 2 of native code. Parallel code using WebCL shows speed improvements of up to 2.28 over JavaScript for the majority of the benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Rhodes:2015:DDO, author = "Dustin Rhodes and Tim Disney and Cormac Flanagan", title = "Dynamic detection of object capability violations through model checking", journal = j-SIGPLAN, volume = "50", number = "2", pages = "103--112", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661099", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper we present a new tool called DOCaT (Dynamic Object Capability Tracer), a model checker for JavaScript that detects capability leaks in an object capability system. DOCaT includes an editor that highlights the sections of code that can be potentially transferred to untrusted third-party code along with a trace showing how the code could be leaked in an actual execution. 
This code highlighting provides a simple way of visualizing the references untrusted code potentially has access to and helps programmers to discover if their code is leaking more capabilities than required. DOCaT is implemented using a combination of source code rewriting (using Sweet.js, a JavaScript macro system), dynamic behavioral intercession (Proxies, introduced in ES6, the most recent version of JavaScript), and model checking. Together these methods are able to locate common ways for untrusted code to elevate its authority.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Steinert:2015:OVS, author = "Bastian Steinert and Lauritz Thamsen and Tim Felgentreff and Robert Hirschfeld", title = "Object versioning to support recovery needs: using proxies to preserve previous development states in {Lively}", journal = j-SIGPLAN, volume = "50", number = "2", pages = "113--124", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661093", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "We present object versioning as a generic approach to preserve access to previous development and application states. Version-aware references can manage the modifications made to the target object and record versions as desired. Such references can be provided without modifications to the virtual machine. We used proxies to implement the proposed concepts and demonstrate the Lively Kernel running on top of this object versioning layer. This enables Lively users to undo the effects of direct manipulation and other programming actions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Matsakis:2015:TOJ, author = "Nicholas D. Matsakis and David Herman and Dmitry Lomov", title = "Typed objects in {JavaScript}", journal = j-SIGPLAN, volume = "50", number = "2", pages = "125--134", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661095", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "JavaScript's typed arrays have proven to be a crucial API for many JS applications, particularly those working with large amounts of data or emulating other languages. Unfortunately, the current typed array API offers no means of abstraction. Programmers are supplied with a simple byte buffer that can be viewed as an array of integers or floats, but nothing more. This paper presents a generalization of the typed arrays API entitled typed objects. The typed objects API is slated for inclusion in the upcoming ES7 standard. The API gives users the ability to define named types, making typed arrays much easier to work with. In particular, it is often trivial to replace uses of existing JavaScript objects with typed objects, resulting in better memory consumption and more predictable performance. The advantages of the typed object specification go beyond convenience, however.
By supporting opacity---that is, the ability to deny access to the raw bytes of a typed object---the new typed object specification makes it possible to store objects as well as scalar data and also enables more optimization by JIT compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Callau:2015:UTP, author = "Oscar Calla{\'u} and Romain Robbes and {\'E}ric Tanter and David R{\"o}thlisberger and Alexandre Bergel", title = "On the use of type predicates in object-oriented software: the case of smalltalk", journal = j-SIGPLAN, volume = "50", number = "2", pages = "135--146", month = feb, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775052.2661091", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Object-orientation relies on polymorphism to express behavioral variants. As opposed to traditional procedural design, explicit type-based conditionals should be avoided. This message is conveyed in introductory material on object orientation, as well as in object-oriented reengineering patterns. Is this principle followed in practice? In other words, are type predicates actually used in object-oriented software, and if so, to which extent? Answering these questions will assist practitioners and researchers with providing information about the state of the practice, and informing the active research program of retrofitting type systems, clarifying whether complex flow-sensitive typing approaches are necessary. Other areas, such as refactoring and teaching object orientation, can also benefit from empirical evidence on the matter. We report on a study of the use of type predicates in a large base of over 4 million lines of Smalltalk code. Our study shows that type predicates are in fact widely used to do explicit type dispatch, suggesting that flow-sensitive typing approaches are necessary for a type system retrofitted for a dynamic object-oriented language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '14 conference proceedings.", } @Article{Jarvi:2015:SPH, author = "Jaakko J{\"a}rvi and Gabriel Foust and Magne Haveraaen", title = "Specializing planners for hierarchical multi-way dataflow constraint systems", journal = j-SIGPLAN, volume = "50", number = "3", pages = "1--10", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658762", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A constraint system consists of variables and a set of constraints on those variables. To solve a constraint system is to find a valuation that satisfies all constraints; or the ``best'' subset of constraints if not all can simultaneously be satisfied. In a multi-way dataflow constraint system, solving requires selecting a set of user-defined functions which, when executed, will enforce the constraints. The task of selecting these functions is called planning. The planner has two kinds of input: the specification of the constraints and an order of priority for those constraints. 
The former typically changes seldom, while the latter frequently, making constraint planning a potential application for program specialization. This paper shows how to generate specialized planners for hierarchical multi-way dataflow constraint systems when the constraints are known in advance. The specialized planners are DFAs; they can be an order of magnitude or more faster than a general purpose planner for the same system. Our applications for constraint systems are in user interface programming, where constraint systems determine how a GUI should react to user interaction---specialized planners can help to ensure that GUIs' responses to user interaction are instantaneous.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Steindorfer:2015:CSM, author = "Michael J. Steindorfer and Jurgen J. Vinju", title = "Code specialization for memory efficient hash tries (short paper)", journal = j-SIGPLAN, volume = "50", number = "3", pages = "11--14", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658763", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The hash trie data structure is a common part in standard collection libraries of JVM programming languages such as Clojure and Scala. It enables fast immutable implementations of maps, sets, and vectors, but it requires considerably more memory than an equivalent array-based data structure. This hinders the scalability of functional programs and the further adoption of this otherwise attractive style of programming. In this paper we present a product family of hash tries. We generate Java source code to specialize them using knowledge of JVM object memory layout. The number of possible specializations is exponential. The optimization challenge is thus to find a minimal set of variants which lead to a maximal loss in memory footprint on any given data. Using a set of experiments we measured the distribution of internal tree node sizes in hash tries. We used the results as a guidance to decide which variants of the family to generate and which variants should be left to the generic implementation. A preliminary validating experiment on the implementation of sets and maps shows that this technique leads to a median decrease of 55\% in memory footprint for maps (and 78\% for sets), while still maintaining comparable performance. 
Our combination of data analysis and code specialization proved to be effective.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Malakuti:2015:EGM, author = "Somayeh Malakuti and Mehmet Aksit", title = "Emergent gummy modules: modular representation of emergent behavior", journal = j-SIGPLAN, volume = "50", number = "3", pages = "15--24", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658764", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emergent behavior is generally defined as the appearance of complex behavior out of a multiplicity of relatively simple interactions. Nowadays, there are various kinds of software systems that deal with detecting the emergence of certain behavior in the environment, representing it in the software, and providing means to manipulate the behavior. While a significant amount of research has been dedicated to developing algorithms for detecting emergent behavior, there has been no dedicated attempt to provide suitable linguistic abstractions to modularize emergent behavior and its related concerns. This results in implementations that are complex and hard to maintain. In this paper, we identify three characteristic features of emergent behavior, and outline the shortcomings of current languages to properly program and modularize emergent behavior. We introduce emergent gummy modules as dedicated linguistic abstractions, which facilitate defining the appearance and disappearance conditions of emergent behavior as well as its utilization operations as one holistic module. We explain the implementation of emergent gummy modules in the GummyJ language, and illustrate that they improve the modularity of implementations. We represent the event processing semantics of GummyJ programs in the UPPAAL model checker and verify their correctness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Gouseti:2015:ELI, author = "Maria Gouseti and Chiel Peters and Tijs van der Storm", title = "Extensible language implementation with object algebras (short paper)", journal = j-SIGPLAN, volume = "50", number = "3", pages = "25--28", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658765", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Object Algebras are a recently introduced design pattern to make the implementation of recursive data types more extensible. In this short paper we report our experience in using Object Algebras in building a realistic domain specific language (DSL) for questionnaires, called QL.
This experience has led to a simple, yet powerful set of tools for the practical and flexible implementation of highly extensible languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Walkingshaw:2015:PEV, author = "Eric Walkingshaw and Klaus Ostermann", title = "Projectional editing of variational software", journal = j-SIGPLAN, volume = "50", number = "3", pages = "29--38", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658766", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Editing the source code of variational software is complicated by the presence of variation annotations, such as \#ifdef statements, and by code that is only included in some configurations. When editing some configurations and not others, it would be easier to edit a simplified version of the source code that includes only the configurations we currently care about. In this paper, we present a projectional editing model for variational software. Using our approach, a programmer can partially configure a variational program, edit this simplified view of the code, and then automatically update the original, fully variational source code. The model is based on an isolation principle where edits affect only the variants that are visible in the view. We show that this principle has several nice properties that are suggested by related work on bidirectional transformations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Ruprecht:2015:AFS, author = "Andreas Ruprecht and Bernhard Heinloth and Daniel Lohmann", title = "Automatic feature selection in large-scale system-software product lines", journal = j-SIGPLAN, volume = "50", number = "3", pages = "39--48", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658767", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "System software can typically be configured at compile time via a comfortable feature-based interface to tailor its functionality towards a specific use case. However, with the growing number of features, this tailoring process becomes increasingly difficult: As a prominent example, the Linux kernel in v3.14 provides nearly 14 000 configuration options to choose from. Even developers of embedded systems refrain from trying to build a minimized distinctive kernel configuration for their device --- and thereby waste memory and money for unneeded functionality. In this paper, we present an approach for the automatic use-case specific tailoring of system software for special-purpose embedded systems. We evaluate the effectiveness of our approach on the example of Linux by generating tailored kernels for well-known applications of the Raspberry Pi and a Google Nexus 4 smartphone.
Compared to the original configurations, our approach leads to memory savings of 15-70 percent and requires only very little manual intervention.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Ma:2015:ETS, author = "Lei Ma and Cyrille Artho and Cheng Zhang and Hiroyuki Sato", title = "Efficient testing of software product lines via centralization (short paper)", journal = j-SIGPLAN, volume = "50", number = "3", pages = "49--52", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658768", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software product line~(SPL) engineering manages families of software products that share common features. However, cost-effective test case generation for an SPL is challenging. Applying existing test case generation techniques to each product variant separately may test common code in a redundant way. Moreover, it is difficult to share the test results among multiple product variants. In this paper, we propose the use of centralization, which combines multiple product variants from the same SPL and generates test cases for the entire system. By taking into account all variants, our technique generally avoids generating redundant test cases for common software components. Our case study on three SPLs shows that compared with testing each variant independently, our technique is more efficient and achieves higher test coverage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Smeltzer:2015:TAD, author = "Karl Smeltzer and Martin Erwig and Ronald Metoyer", title = "A transformational approach to data visualization", journal = j-SIGPLAN, volume = "50", number = "3", pages = "53--62", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658769", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Information visualization construction tools generally tend to fall in one of two disparate categories. Either they offer simple but inflexible visualization templates, or else they offer low-level graphical primitives which need to be assembled manually. Those that do offer flexible, domain-specific abstractions rarely focus on incrementally building and transforming visualizations, which could reduce limitations on the style of workflows supported. We present a Haskell-embedded DSL for data visualization that is designed to provide such abstractions and transformations. 
This DSL achieves additional expressiveness and flexibility through common functional programming idioms and the Haskell type class hierarchy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Shioda:2015:LLD, author = "Masato Shioda and Hideya Iwasaki and Shigeyuki Sato", title = "{LibDSL}: a library for developing embedded domain specific languages in {D} via template metaprogramming", journal = j-SIGPLAN, volume = "50", number = "3", pages = "63--72", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658770", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a library called LibDSL that helps the implementer of an embedded domain specific language (EDSL) effectively develop it in the D language. The LibDSL library accepts as input some kinds of ``specifications'' of the EDSL that the implementer is going to develop and a D program within which an EDSL source program written by the user is embedded. It produces the front-end code of an LALR parser for the EDSL program and back-end code of the execution engine. LibDSL is able to produce two kinds of execution engines, namely compiler-based and interpreter-based engines, either of which the user can properly choose depending on whether an EDSL program is known at compile time or not. We have implemented the LibDSL system by using template metaprogramming and other advanced facilities such as compile-time function execution of the D language. EDSL programs developed by means of LibDSL have a nice integrativeness with the host language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Jovanovic:2015:YYC, author = "Vojin Jovanovic and Amir Shaikhha and Sandro Stucki and Vladimir Nikolaev and Christoph Koch and Martin Odersky", title = "{Yin-Yang}: concealing the deep embedding of {DSLs}", journal = j-SIGPLAN, volume = "50", number = "3", pages = "73--82", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658771", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Deeply embedded domain-specific languages (EDSLs) intrinsically compromise programmer experience for improved program performance. Shallow EDSLs complement them by trading program performance for good programmer experience. We present Yin-Yang, a framework for DSL embedding that uses Scala macros to reliably translate shallow EDSL programs to the corresponding deep EDSL programs. The translation allows program prototyping and development in the user-friendly shallow embedding, while the corresponding deep embedding is used where performance is important. The reliability of the translation completely conceals the deep embedding from the user. For the DSL author, Yin-Yang automatically generates the deep DSL embeddings from their shallow counterparts by reusing the core translation.
This obviates the need for code duplication and leads to reliability by construction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Hess:2015:ALF, author = "Benjamin Hess and Thomas R. Gross and Markus P{\"u}schel", title = "Automatic locality-friendly interface extension of numerical functions", journal = j-SIGPLAN, volume = "50", number = "3", pages = "83--92", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658772", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Raising the level of abstraction is a key concern of software engineering, and libraries (either used directly or as a target of a program generation system) are a successful technique to raise programmer productivity and to improve software quality. Unfortunately successful libraries may contain functions that may not be general enough. For example, many numeric performance libraries contain functions that work on one- or higher-dimensional arrays. A problem arises if a program wants to invoke such a function on a non-contiguous subarray (e.g., in C the column of a matrix or a subarray of an image). If the library developer did not foresee this scenario, the client program must include explicit copy steps before and after the library function call, incurring a possibly high performance penalty. A better solution would be an enhanced library function that allows for the desired access pattern. Exposing the access pattern allows the compiler to optimize for the intended usage scenario(s). As we do not want the library developer to generate all interesting versions manually, we present a tool that takes a library function written in C and generates such a customized function for typical accesses. We describe the approach, discuss limitations, and report on the performance. As example access patterns we consider those most common in numerical applications: striding and block striding, general permutations, as well as scaling. We evaluate the tool on various library functions including filters, scans, reductions, sorting, FFTs, and linear algebra operations. The automatically generated custom version is in most cases significantly faster than using individual steps, offering speed-ups that are typically in the range of 1.2--1.8x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Kamin:2015:ORS, author = "Sam Kamin and Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Baris Aktemur and Danqing Xu and Buse Yilmaz and Zhongbo Chen", title = "Optimization by runtime specialization for sparse matrix--vector multiplication", journal = j-SIGPLAN, volume = "50", number = "3", pages = "93--102", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658773", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Runtime specialization optimizes programs based on partial information available only at run time. It is applicable when some input data is used repeatedly while other input data varies. 
This technique has the potential of generating highly efficient codes. In this paper, we explore the potential for obtaining speedups for sparse matrix-dense vector multiplication using runtime specialization, in the case where a single matrix is to be multiplied by many vectors. We experiment with five methods involving runtime specialization, comparing them to methods that do not (including Intel's MKL library). For this work, our focus is the evaluation of the speedups that can be obtained with runtime specialization without considering the overheads of the code generation. Our experiments use 23 matrices from the Matrix Market and Florida collections, and run on five different machines. In 94 of those 115 cases, the specialized code runs faster than any version without specialization. If we only use specialization, the average speedup with respect to Intel's MKL library ranges from 1.44x to 1.77x, depending on the machine. We have also found that the best method depends on the matrix and machine; no method is best for all matrices and machines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Danilewski:2015:STD, author = "Piotr Danilewski and Marcel K{\"o}ster and Roland Lei{\ss}a and Richard Membarth and Philipp Slusallek", title = "Specialization through dynamic staging", journal = j-SIGPLAN, volume = "50", number = "3", pages = "103--112", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658774", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Partial evaluation allows for specialization of program fragments. This can be realized by staging, where one fragment is executed earlier than its surrounding code. However, taking advantage of these capabilities is often a cumbersome endeavor. In this paper, we present a new metaprogramming concept using staging parameters that are first-class citizen entities and define the order of execution of the program. Staging parameters can be used to define MetaML-like quotations, but can also allow stages to be created and resolved dynamically. The programmer can write generic, polyvariant code which can be reused in the context of different stages. We demonstrate how our approach can be used to define and apply domain-specific optimizations. Our implementation of the proposed metaprogramming concept generates code which is on a par with templated C++ code in terms of execution time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Asai:2015:CRL, author = "Kenichi Asai", title = "Compiling a reflective language using {MetaOCaml}", journal = j-SIGPLAN, volume = "50", number = "3", pages = "113--122", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658775", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A reflective language makes the language semantics open to user programs and allows them to access, extend, and modify it from within the same language framework. 
Because of its high flexibility and expressiveness, it can be an ideal platform for programming language research as well as practical applications in dynamic environments. However, efficient implementation of a reflective language is extremely difficult. Under circumstances where the language semantics can change, a partial evaluator is required for compilation. This paper reports on the experience of using MetaOCaml as a compiler for a reflective language. With staging annotations, MetaOCaml achieves the same effect as using a partial evaluator. Unlike the standard partial evaluator, the run mechanism of MetaOCaml enables us to use the specialized (compiled) code in the current runtime environment. On the other hand, the lack of a binding-time analysis in MetaOCaml prohibits us from compiling a user program under modified compiled semantics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Humer:2015:DSL, author = "Christian Humer and Christian Wimmer and Christian Wirth and Andreas W{\"o}{\ss} and Thomas W{\"u}rthinger", title = "A domain-specific language for building self-optimizing {AST} interpreters", journal = j-SIGPLAN, volume = "50", number = "3", pages = "123--132", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658776", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Self-optimizing AST interpreters dynamically adapt to the provided input for faster execution. This adaptation includes initial tests of the input, changes to AST nodes, and insertion of guards that ensure assumptions still hold. Such specialization and speculation is essential for the performance of dynamic programming languages such as JavaScript. In traditional procedural and object-oriented programming languages it can be tedious to write self-optimizing AST interpreters, as those languages fail to provide constructs that would specifically support that. This paper introduces a declarative domain-specific language (DSL) that greatly simplifies writing self-optimizing AST interpreters. The DSL supports specialization of operations based on types of the input and other properties. It can then use these specializations directly or chain them to represent the operation with the minimum amount of code possible. The DSL significantly reduces the complexity of expressing specializations for those interpreters. We use it in our high-performance implementation of JavaScript, where 274 language operations have an average of about 4 and a maximum of 190 specializations. In addition, the DSL is used in implementations of Ruby, Python, R, and Smalltalk.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Hill:2015:POO, author = "James H. Hill and Dennis C.
Feiock", title = "{Pin++}: an object-oriented framework for writing {Pintools}", journal = j-SIGPLAN, volume = "50", number = "3", pages = "133--141", month = mar, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775053.2658777", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:23 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a framework named Pin++. Pin++ is an object-oriented framework that uses template metaprogramming to implement Pintools, which are analysis tools for the dynamic binary instrumentation tool named Pin. The goal of Pin++ is to simplify programming a Pintool and promote reuse of its components across different Pintools. Our results show that Pintools implemented using Pin++ can have a 54\% reduction in complexity, increase its modularity, and up to 60\% reduction in instrumentation overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '14 conference proceedings.", } @Article{Ozturk:2015:ASC, author = "Ozcan Ozturk", title = "Architectural Support for Cyber-Physical Systems", journal = j-SIGPLAN, volume = "50", number = "4", pages = "1--1", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694375", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cyber-physical systems are integrations of computation, communication networks, and physical dynamics. Although time plays a central role in the physical world, all widely used software abstractions lack temporal semantics. The notion of correct execution of a program written in every widely-used programming language today does not depend on the temporal behavior of the program. But temporal behavior matters in almost all systems, and most particularly in cyber-physical systems. In this talk, I will argue that time can and must become part of the semantics of programs for a large class of applications. To illustrate that this is both practical and useful, we will describe a recent effort at Berkeley in the design and implementation of timing-centric software systems. Specifically, I will describe PRET machines, which redefine the instruction-set architecture (ISA) of a microprocessor to embrace temporal semantics. 
Such machines can be used in high-confidence and safety-critical systems, in energy-constrained systems, in mixed-criticality systems, and as a Real-Time Unit (RTU) that cooperates with a general-purpose processor to provide real-time services, in a manner similar to how a GPU provides graphics services.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Zhang:2015:MRH, author = "Yiying Zhang and Jian Yang and Amirsaman Memaripour and Steven Swanson", title = "{Mojim}: a Reliable and Highly-Available Non-Volatile Memory System", journal = j-SIGPLAN, volume = "50", number = "4", pages = "3--18", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694370", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Next-generation non-volatile memories (NVMs) promise DRAM-like performance, persistence, and high density. They can attach directly to processors to form non-volatile main memory (NVMM) and offer the opportunity to build very low-latency storage systems. These high-performance storage systems would be especially useful in large-scale data center environments where reliability and availability are critical. However, providing reliability and availability to NVMM is challenging, since the latency of data replication can overwhelm the low latency that NVMM should provide. We propose Mojim, a system that provides the reliability and availability that large-scale storage systems require, while preserving the performance of NVMM. Mojim achieves these goals by using a two-tier architecture in which the primary tier contains a mirrored pair of nodes and the secondary tier contains one or more secondary backup nodes with weakly consistent copies of data. Mojim uses highly-optimized replication protocols, software, and networking stacks to minimize replication costs and expose as much of NVMM's performance as possible. We evaluate Mojim using raw DRAM as a proxy for NVMM and using an industrial NVMM emulation system. We find that Mojim provides replicated NVMM with similar or even better performance than un-replicated NVMM (reducing latency by 27\% to 63\% and delivering between 0.4 and 2.7X the throughput). We demonstrate that replacing MongoDB's built-in replication system with Mojim improves MongoDB's performance by 3.4 to 4X.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Wang:2015:SPC, author = "Rujia Wang and Lei Jiang and Youtao Zhang and Jun Yang", title = "{SD-PCM}: Constructing Reliable Super Dense Phase Change Memory under Write Disturbance", journal = j-SIGPLAN, volume = "50", number = "4", pages = "19--31", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694352", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Phase Change Memory (PCM) has better scalability and smaller cell size compared to DRAM. However, further scaling the PCM cell in the deep sub-micron regime results in significant thermal-based write disturbance (WD).
Naively allocating large inter-cell space increases cell size from the ideal 4F$^2$ to 12F$^2$. While a recent work mitigates WD along word-lines through disturbance resilient data encoding, it is ineffective for WD along bit-lines, which is more severe due to the widely adopted $ \mu $Trench structure in constructing PCM cell arrays. Without mitigating WD along bit-lines, a PCM cell still has 8F$^2$, which is 100\% larger than the ideal. In this paper, we propose SD-PCM for achieving reliable write operations in super dense PCM. In particular, we focus on mitigating WD along bit-lines such that we can construct super dense PCM chips with 4F$^2$ cell size, i.e., the minimum for diode-switch based PCM. Based on simple verification-n-correction (VnC), we propose LazyCorrection and PreRead to effectively reduce VnC overhead and minimize cascading verification during write. We further propose (n:m)-Alloc for achieving a good tradeoff between VnC overhead minimization and memory capacity loss. Our experimental results show that, compared to a WD-free low-density PCM, SD-PCM achieves 80\% capacity improvement in cell arrays while incurring around 0-10\% performance degradation when using different (n:m) allocators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Young:2015:DWE, author = "Vinson Young and Prashant J. Nair and Moinuddin K. Qureshi", title = "{DEUCE}: Write-Efficient Encryption for Non-Volatile Memories", journal = j-SIGPLAN, volume = "50", number = "4", pages = "33--44", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694387", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Phase Change Memory (PCM) is an emerging Non Volatile Memory (NVM) technology that has the potential to provide scalable high-density memory systems. While the non-volatility of PCM is a desirable property in order to save leakage power, it also has the undesirable effect of making PCM main memories susceptible to newer modes of security vulnerabilities, for example, accessibility to sensitive data if a PCM DIMM gets stolen. PCM memories can be made secure by encrypting the data. Unfortunately, such encryption comes with a significant overhead in terms of bits written to PCM memory, causing half of the bits in the line to change on every write, even if the actual number of bits being written to memory is small. Our studies show that a typical writeback modifies, on average, only 12\% of the bits in the cacheline. Thus, encryption causes almost a 4x increase in the number of bits written to PCM memories. Such extraneous bit writes cause a significant increase in write power, reduction in write endurance, and reduction in write bandwidth. To provide the benefit of secure memory in a write-efficient manner, this paper proposes Dual Counter Encryption (DEUCE). DEUCE is based on the observation that a typical writeback only changes a few words, so DEUCE reencrypts only the words that have changed.
We show that DEUCE reduces the number of modified bits per writeback for a secure memory from 50\% to 24\%, which improves performance by 27\% and increases lifetime by 2x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Morrison:2015:TBT, author = "Adam Morrison and Yehuda Afek", title = "Temporally Bounding {TSO} for Fence-Free Asymmetric Synchronization", journal = j-SIGPLAN, volume = "50", number = "4", pages = "45--58", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694374", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces a temporally bounded total store ordering (TBTSO) memory model, and shows that it enables nonblocking fence-free solutions to asymmetric synchronization problems, such as those arising in memory reclamation and biased locking. TBTSO strengthens the TSO memory model by bounding the time it takes a store to drain from the store buffer into memory. This bound enables devising fence-free algorithms for asymmetric problems, which require a performance-critical fast path to synchronize with an infrequently executed slow path. We demonstrate this by constructing (1) a fence-free version of the hazard pointers memory reclamation scheme, and (2) a fence-free biased lock algorithm which is compatible with unmanaged environments as it does not rely on safe points or similar mechanisms. We further argue that TBTSO can be implemented in hardware with modest modifications to existing TSO architectures. However, our design makes assumptions about proprietary implementation details of commercial hardware; it thus best serves as a starting point for a discussion on the feasibility of hardware TBTSO implementation. We also show how minimal OS support enables the adaptation of TBTSO algorithms to x86 systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Matveev:2015:RHN, author = "Alexander Matveev and Nir Shavit", title = "Reduced Hardware {NOrec}: a Safe and Scalable Hybrid Transactional Memory", journal = j-SIGPLAN, volume = "50", number = "4", pages = "59--71", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694393", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Because of hardware TM limitations, software fallbacks are the only way to make TM algorithms guarantee progress. Nevertheless, all known software fallbacks to date, from simple locks to sophisticated versions of the NOrec Hybrid TM algorithm, have either limited scalability or weakened semantics. We propose a novel reduced-hardware (RH) version of the NOrec HyTM algorithm. Instead of an all-software slow path, in our RH NOrec the slow-path is a ``mix'' of hardware and software: one short hardware transaction executes a maximal amount of initial reads in the hardware, and the second executes all of the writes. 
This novel combination of the RH approach and the NOrec algorithm delivers the first Hybrid TM that scales while fully preserving the hardware's original semantics of opacity and privatization. Our GCC implementation of RH NOrec is promising in that it shows improved performance relative to all prior methods, at the concurrency levels we could test today.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Orr:2015:SUR, author = "Marc S. Orr and Shuai Che and Ayse Yilmazer and Bradford M. Beckmann and Mark D. Hill and David A. Wood", title = "Synchronization Using Remote-Scope Promotion", journal = j-SIGPLAN, volume = "50", number = "4", pages = "73--86", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694350", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Heterogeneous system architecture (HSA) and OpenCL define scoped synchronization to facilitate low overhead communication across a subset of threads. Scoped synchronization works well for static sharing patterns, where consumer threads are known a priori. It works poorly for dynamic sharing patterns (e.g., work stealing) where programmers cannot use a faster small scope due to the rare possibility that the work is stolen by a thread in a distant slower scope. This puts programmers in a conundrum: optimize the common case by synchronizing at a faster small scope or use work stealing at a slower large scope. In this paper, we propose to extend scoped synchronization with remote-scope promotion. This allows the most frequent sharers to synchronize through a small scope. Infrequent sharers synchronize by promoting that remote small scope to a larger shared scope. Synchronization using remote-scope promotion provides performance robustness for dynamic workloads, where the benefits provided by scoped synchronization and work stealing are hard to anticipate. Compared to a na{\"\i}ve baseline, static scoped synchronization alone achieves a 1.07x speedup on average and dynamic work stealing alone achieves a 1.18x speedup on average. In contrast, synchronization using remote-scope promotion achieves a robust 1.25x speedup on average, across a diverse set of graph benchmarks and inputs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Liu:2015:GHS, author = "Chang Liu and Austin Harris and Martin Maas and Michael Hicks and Mohit Tiwari and Elaine Shi", title = "{GhostRider}: a Hardware-Software System for Memory Trace Oblivious Computation", journal = j-SIGPLAN, volume = "50", number = "4", pages = "87--101", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694385", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a new, co-designed compiler and architecture called GhostRider for supporting privacy preserving computation in the cloud. 
GhostRider ensures all programs satisfy a property called memory-trace obliviousness (MTO): Even an adversary that observes memory, bus traffic, and access times while the program executes can learn nothing about the program's sensitive inputs and outputs. One way to achieve MTO is to employ Oblivious RAM (ORAM), allocating all code and data in a single ORAM bank, and to also disable caches or fix the rate of memory traffic. This baseline approach can be inefficient, and so GhostRider's compiler uses a program analysis to do better, allocating data to non-oblivious, encrypted RAM (ERAM) and employing a scratchpad when doing so will not compromise MTO. The compiler can also allocate to multiple ORAM banks, which sometimes significantly reduces access times. We have formalized our approach and proved it enjoys MTO. Our FPGA-based hardware prototype and simulation results show that GhostRider significantly outperforms the baseline strategy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Fletcher:2015:FON, author = "Christopher W. Fletcher and Ling Ren and Albert Kwon and Marten van Dijk and Srinivas Devadas", title = "Freecursive {ORAM}: [Nearly] Free Recursion and Integrity Verification for Position-based Oblivious {RAM}", journal = j-SIGPLAN, volume = "50", number = "4", pages = "103--116", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694353", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Oblivious RAM (ORAM) is a cryptographic primitive that hides memory access patterns as seen by untrusted storage. Recently, ORAM has been architected into secure processors. A big challenge for hardware ORAM schemes is how to efficiently manage the Position Map (PosMap), a central component in modern ORAM algorithms. Implemented naively, the PosMap causes ORAM to be fundamentally unscalable in terms of on-chip area. On the other hand, a technique called Recursive ORAM fixes the area problem yet significantly increases ORAM's performance overhead. To address this challenge, we propose three new mechanisms. We propose a new ORAM structure called the PosMap Lookaside Buffer (PLB) and PosMap compression techniques to reduce the performance overhead from Recursive ORAM empirically (the latter also improves the construction asymptotically). Through simulation, we show that these techniques reduce the memory bandwidth overhead needed to support recursion by 95\%, reduce overall ORAM bandwidth by 37\% and improve overall SPEC benchmark performance by 1.27x. We then show how our PosMap compression techniques further facilitate an extremely efficient integrity verification scheme for ORAM which we call PosMap MAC (PMMAC). For a practical parameterization, PMMAC reduces the amount of hashing needed for integrity checking by $ \geq 68 \times $ relative to prior schemes and introduces only 7\% performance overhead. We prototype our mechanisms in hardware and report area and clock frequency for a complete ORAM design post-synthesis and post-layout using an ASIC flow in a 32~nm commercial process. With 2 DRAM channels, the design post-layout runs at 1~GHz and has a total area of 0.47~mm$^2$. Depending on PLB-specific parameters, the PLB accounts for 10\% to 26\% area.
PMMAC costs 12\% of total design area. Our work is the first to prototype Recursive ORAM or ORAM with any integrity scheme in hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Chisnall:2015:BPA, author = "David Chisnall and Colin Rothwell and Robert N. M. Watson and Jonathan Woodruff and Munraj Vadera and Simon W. Moore and Michael Roe and Brooks Davis and Peter G. Neumann", title = "Beyond the {PDP-11}: Architectural Support for a Memory-Safe {C} Abstract Machine", journal = j-SIGPLAN, volume = "50", number = "4", pages = "117--130", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694367", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a new memory-safe interpretation of the C abstract machine that provides stronger protection to benefit security and debugging. Despite ambiguities in the specification intended to provide implementation flexibility, contemporary implementations of C have converged on a memory model similar to the PDP-11, the original target for C. This model lacks support for memory safety despite well-documented impacts on security and reliability. Attempts to change this model are often hampered by assumptions embedded in a large body of existing C code, dating back to the memory model exposed by the original C compiler for the PDP-11. Our experience with attempting to implement a memory-safe variant of C on the CHERI experimental microprocessor led us to identify a number of problematic idioms. We describe these as well as their interaction with existing memory safety schemes and the assumptions that they make beyond the requirements of the C specification. Finally, we refine the CHERI ISA and abstract model for C, by combining elements of the CHERI capability model and fat pointers, and present a softcore CPU that implements a C abstract machine that can run legacy C code with strong memory protection guarantees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Ma:2015:SDS, author = "Jiuyue Ma and Xiufeng Sui and Ninghui Sun and Yupeng Li and Zihao Yu and Bowen Huang and Tianni Xu and Zhicheng Yao and Yun Chen and Haibin Wang and Lixin Zhang and Yungang Bao", title = "Supporting Differentiated Services in Computers via Programmable Architecture for Resourcing-on-Demand {(PARD)}", journal = j-SIGPLAN, volume = "50", number = "4", pages = "131--143", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694382", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "This paper presents PARD, a programmable architecture for resourcing-on-demand that provides a new programming interface to convey an application's high-level information like quality-of-service requirements to the hardware. PARD enables new functionalities like fully hardware-supported virtualization and differentiated services in computers. 
PARD is inspired by the observation that a computer is inherently a network in which hardware components communicate via packets (e.g., over the NoC or PCIe). We apply principles of software-defined networking to this intra-computer network and address three major challenges. First, to deal with the semantic gap between high-level applications and underlying hardware packets, PARD attaches a high-level semantic tag (e.g., a virtual machine or thread ID) to each memory-access, I/O, or interrupt packet. Second, to make hardware components more manageable, PARD implements programmable control planes that can be integrated into various shared resources (e.g., cache, DRAM, and I/O devices) and can differentially process packets according to tag-based rules. Third, to facilitate programming, PARD abstracts all control planes as a device file tree to provide a uniform programming interface via which users create and apply tag-based rules. Full-system simulation results show that by co-locating latency-critical memcached applications with other workloads PARD can improve a four-core computer's CPU utilization by up to a factor of four without significantly increasing tail latency. FPGA emulation based on a preliminary RTL implementation demonstrates that the cache control plane introduces no extra latency and that the memory control plane can reduce queueing delay for high-priority memory-access requests by up to a factor of 5.6.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Omote:2015:IAE, author = "Yushi Omote and Takahiro Shinagawa and Kazuhiko Kato", title = "Improving Agility and Elasticity in Bare-metal Clouds", journal = j-SIGPLAN, volume = "50", number = "4", pages = "145--159", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694349", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Bare-metal clouds are an emerging infrastructure-as-a-service (IaaS) that leases physical machines (bare-metal instances) rather than virtual machines, allowing resource-intensive applications to have exclusive access to physical hardware. Unfortunately, bare-metal instances require time-consuming or OS-specific tasks for deployment due to the lack of virtualization layers, thereby sacrificing several beneficial features of traditional IaaS clouds such as agility, elasticity, and OS transparency. We present BMcast, an OS deployment system with a special-purpose de-virtualizable virtual machine monitor (VMM) that supports quick and OS-transparent startup of bare-metal instances. BMcast performs streaming OS deployment while allowing direct access to physical hardware from the guest OS, and then disappears after completing the deployment. Quick startup of instances improves agility and elasticity significantly, and OS transparency greatly simplifies management tasks for cloud customers. Experimental results have confirmed that BMcast initiated a bare-metal instance 8.6 times faster than image copying, and database performance on BMcast during streaming OS deployment was comparable to that on a state-of-the-art VMM without performing deployment. 
BMcast incurred zero overhead after de-virtualization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Haque:2015:FMI, author = "Md E. Haque and Yong hun Eom and Yuxiong He and Sameh Elnikety and Ricardo Bianchini and Kathryn S. McKinley", title = "Few-to-Many: Incremental Parallelism for Reducing Tail Latency in Interactive Services", journal = j-SIGPLAN, volume = "50", number = "4", pages = "161--175", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694384", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Interactive services, such as Web search, recommendations, games, and finance, must respond quickly to satisfy customers. Achieving this goal requires optimizing tail (e.g., 99th+ percentile) latency. Although every server is multicore, parallelizing individual requests to reduce tail latency is challenging because (1) service demand is unknown when requests arrive; (2) blindly parallelizing all requests quickly oversubscribes hardware resources; and (3) parallelizing the numerous short requests will not improve tail latency. This paper introduces Few-to-Many (FM) incremental parallelization, which dynamically increases parallelism to reduce tail latency. FM uses request service demand profiles and hardware parallelism in an offline phase to compute a policy, represented as an interval table, which specifies when and how much software parallelism to add. At runtime, FM adds parallelism as specified by the interval table indexed by dynamic system load and request execution time progress. The longer a request executes, the more parallelism FM adds. We evaluate FM in Lucene, an open-source enterprise search engine, and in Bing, a commercial Web search engine. FM improves the 99th percentile response time up to 32\% in Lucene and up to 26\% in Bing, compared to prior state-of-the-art parallelization. Compared to running requests sequentially in Bing, FM improves tail latency by a factor of two. These results illustrate that incremental parallelism is a powerful tool for reducing tail latency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Colp:2015:PDS, author = "Patrick Colp and Jiawen Zhang and James Gleeson and Sahil Suneja and Eyal de Lara and Himanshu Raj and Stefan Saroiu and Alec Wolman", title = "Protecting Data on {Smartphones} and Tablets from Memory Attacks", journal = j-SIGPLAN, volume = "50", number = "4", pages = "177--189", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694380", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Smartphones and tablets are easily lost or stolen. This makes them susceptible to an inexpensive class of memory attacks, such as cold-boot attacks, using a bus monitor to observe the memory bus, and DMA attacks. This paper describes Sentry, a system that allows applications and OS components to store their code and data on the System-on-Chip (SoC) rather than in DRAM. 
We use ARM-specific mechanisms originally designed for embedded systems, but still present in today's mobile devices, to protect applications and OS subsystems from memory attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Dautenhahn:2015:NKO, author = "Nathan Dautenhahn and Theodoros Kasampalis and Will Dietz and John Criswell and Vikram Adve", title = "Nested Kernel: an Operating System Architecture for Intra-Kernel Privilege Separation", journal = j-SIGPLAN, volume = "50", number = "4", pages = "191--206", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694386", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Monolithic operating system designs undermine the security of computing systems by allowing single exploits anywhere in the kernel to enjoy full supervisor privilege. The nested kernel operating system architecture addresses this problem by ``nesting'' a small isolated kernel within a traditional monolithic kernel. The ``nested kernel'' interposes on all updates to virtual memory translations to assert protections on physical memory, thus significantly reducing the trusted computing base for memory access control enforcement. We incorporated the nested kernel architecture into FreeBSD on x86-64 hardware while allowing the entire operating system, including untrusted components, to operate at the highest hardware privilege level by write-protecting MMU translations and de-privileging the untrusted part of the kernel. Our implementation inherently enforces kernel code integrity while still allowing dynamically loaded kernel modules, thus defending against code injection attacks. We also demonstrate that the nested kernel architecture allows kernel developers to isolate memory in ways not possible in monolithic kernels by introducing write-mediation and write-logging services to protect critical system data structures. Performance of the nested kernel prototype shows modest overheads: $ < 1 \% $ average for Apache and 2.7\% for kernel compile. Overall, our results and experience show that the nested kernel design can be retrofitted to existing monolithic kernels, providing important security benefits.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Tan:2015:DWS, author = "Zhangxi Tan and Zhenghao Qian and Xi Chen and Krste Asanovic and David Patterson", title = "{DIABLO}: a Warehouse-Scale Computer Network Simulator using {FPGAs}", journal = j-SIGPLAN, volume = "50", number = "4", pages = "207--221", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694362", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Motivated by rapid software and hardware innovation in warehouse-scale computing (WSC), we visit the problem of warehouse-scale network design evaluation. A WSC is composed of about 30 arrays or clusters, each of which contains about 3000 servers, leading to a total of about 100,000 servers per WSC. 
We found many prior experiments have been conducted on relatively small physical testbeds, and they often assume the workload is static and that computations are only loosely coupled with the adaptive networking stack. We present a novel and cost-efficient FPGA-based evaluation methodology, called Datacenter-In-A-Box at LOw cost (DIABLO), which treats arrays as whole computers with tightly integrated hardware and software. We have built a 3,000-node prototype running the full WSC software stack. Using our prototype, we have successfully reproduced a few WSC phenomena, such as TCP Incast and memcached request latency long tail, and found that results do indeed change with both scale and with version of the full software stack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Hauswald:2015:SOE, author = "Johann Hauswald and Michael A. Laurenzano and Yunqi Zhang and Cheng Li and Austin Rovinski and Arjun Khurana and Ronald G. Dreslinski and Trevor Mudge and Vinicius Petrucci and Lingjia Tang and Jason Mars", title = "{Sirius}: an Open End-to-End Voice and Vision Personal Assistant and Its Implications for Future Warehouse Scale Computers", journal = j-SIGPLAN, volume = "50", number = "4", pages = "223--238", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694347", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As user demand scales for intelligent personal assistants (IPAs) such as Apple's Siri, Google's Google Now, and Microsoft's Cortana, we are approaching the computational limits of current datacenter architectures. It is an open question how future server architectures should evolve to enable this emerging class of applications, and the lack of an open-source IPA workload is an obstacle in addressing this question. In this paper, we present the design of Sirius, an open end-to-end IPA web-service application that accepts queries in the form of voice and images, and responds with natural language. We then use this workload to investigate the implications of four points in the design space of future accelerator-based server architectures spanning traditional CPUs, GPUs, manycore throughput co-processors, and FPGAs. To investigate future server designs for Sirius, we decompose Sirius into a suite of 7 benchmarks (Sirius Suite) comprising the computationally intensive bottlenecks of Sirius. We port Sirius Suite to a spectrum of accelerator platforms and use the performance and power trade-offs across these platforms to perform a total cost of ownership (TCO) analysis of various server design points. In our study, we find that accelerators are critical for the future scalability of IPA services. Our results show that GPU- and FPGA-accelerated servers improve the query latency on average by 10x and 16x. 
For a given throughput, GPU- and FPGA-accelerated servers can reduce the TCO of datacenters by 2.6x and 1.4x, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Xu:2015:ALD, author = "Chao Xu and Felix Xiaozhu Lin and Yuyang Wang and Lin Zhong", title = "Automated {OS}-level Device Runtime Power Management", journal = j-SIGPLAN, volume = "50", number = "4", pages = "239--252", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694360", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Non-CPU devices on a modern system-on-a-chip (SoC), ranging from accelerators to I/O controllers, account for a significant portion of the chip area. It is therefore vital for system energy efficiency that idle devices can enter a low-power state while still meeting the performance expectation. This is called device runtime Power Management (PM) for which individual device drivers in commodity OSes are held responsible today. Based on the observations of existing drivers and their evolution, we consider it harmful to rely on drivers for device runtime PM. This paper identifies three pieces of information as essential to device runtime PM, and shows that they can be obtained without involving drivers, either by using a software-only approach, or more efficiently, by adding one register bit to each device. We thus suggest a structural change to the current Linux runtime PM framework, replacing the PM code in all applicable drivers with a single kernel module called the central PM agent. Experimental evaluations show that the central PM agent is just as effective as hand-tuned driver PM code. The paper also presents a tool called PowerAdvisor that simplifies driver PM efforts under the current Linux runtime PM framework. PowerAdvisor analyzes execution traces and suggests where to insert PM calls in driver source code. Despite being a best-effort tool, PowerAdvisor not only reproduces hand-tuned PM code from stock drivers, but also correctly suggests PM code never known before. Overall, our experience shows that it is promising to ultimately free driver developers from manual PM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Goiri:2015:CTV, author = "{\'I}{\~n}igo Goiri and Thu D. Nguyen and Ricardo Bianchini", title = "{CoolAir}: Temperature- and Variation-Aware Management for Free-Cooled Datacenters", journal = j-SIGPLAN, volume = "50", number = "4", pages = "253--265", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694378", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite its benefits, free cooling may expose servers to high absolute temperatures, wide temperature variations, and high humidity when datacenters are sited at certain locations. Prior research (in non-free-cooled datacenters) has shown that high temperatures and/or wide temporal temperature variations can harm hardware reliability. 
In this paper, we identify the runtime management strategies required to limit absolute temperatures, temperature variations, humidity, and cooling energy in free-cooled datacenters. As the basis for our study, we propose CoolAir, a system that embodies these strategies. Using CoolAir and a real free-cooled datacenter prototype, we show that effective management requires cooling infrastructures that can act smoothly. In addition, we show that CoolAir can tightly manage temperature and significantly reduce temperature variation, often at a lower cooling cost than existing free-cooled datacenters. Perhaps most importantly, based on our results, we derive several principles and lessons that should guide the design of management systems for free-cooled datacenters of any size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Mishra:2015:PGM, author = "Nikita Mishra and Huazhe Zhang and John D. Lafferty and Henry Hoffmann", title = "A Probabilistic Graphical Model-based Approach for Minimizing Energy Under Performance Constraints", journal = j-SIGPLAN, volume = "50", number = "4", pages = "267--281", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694373", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In many deployments, computer systems are underutilized --- meaning that applications have performance requirements that demand less than full system capacity. Ideally, we would take advantage of this under-utilization by allocating system resources so that the performance requirements are met and energy is minimized. This optimization problem is complicated by the fact that the performance and power consumption of various system configurations are often application --- or even input --- dependent. Thus, practically, minimizing energy for a performance constraint requires fast, accurate estimations of application-dependent performance and power tradeoffs. This paper investigates machine learning techniques that enable energy savings by learning Pareto-optimal power and performance tradeoffs. Specifically, we propose LEO, a probabilistic graphical model-based learning system that provides accurate online estimates of an application's power and performance as a function of system configuration. We compare LEO to (1) offline learning, (2) online learning, (3) a heuristic approach, and (4) the true optimal solution. We find that LEO produces the most accurate estimates and near optimal energy savings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Pang:2015:MLL, author = "Jun Pang and Chris Dwyer and Alvin R. 
Lebeck", title = "More is Less, Less is More: Molecular-Scale Photonic {NoC} Power Topologies", journal = j-SIGPLAN, volume = "50", number = "4", pages = "283--296", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Molecular-scale Network-on-Chip (mNoC) crossbars use quantum dot LEDs as an on-chip light source, and chromophores to provide optical signal filtering for receivers. An mNoC reduces power consumption or enables scaling to larger crossbars for a reduced energy budget compared to current nanophotonic NoC crossbars. Since communication latency is reduced by using a high-radix crossbar, minimizing power consumption becomes a primary design target. Conventional Single Writer Multiple Reader (SWMR) photonic crossbar designs broadcast all packets, and incur the commensurate required power, even if only two nodes are communicating. This paper introduces power topologies, enabled by unique capabilities of mNoC technology, to reduce overall interconnect power consumption. A power topology corresponds to the logical connectivity provided by a given power mode. Broadcast is one power mode and it consumes the maximum power. Additional power modes consume less power but allow a source to communicate with only a statically defined, potentially non-contiguous, subset of nodes. Overall interconnect power is reduced if the more frequently communicating nodes use modes that consume less power, while less frequently communicating nodes use modes that consume more power. We also investigate thread mapping techniques to fully exploit power topologies. We explore various mNoC power topologies with one, two and four power modes for a radix-256 SWMR mNoC crossbar. Our results show that the combination of power topologies and intelligent thread mapping can reduce total mNoC power by up to 51\% on average for a set of 12 SPLASH benchmarks. Furthermore performance is 10\% better than conventional resonator-based photonic NoCs and energy is reduced by 72\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Sridharan:2015:MEM, author = "Vilas Sridharan and Nathan DeBardeleben and Sean Blanchard and Kurt B. Ferreira and Jon Stearley and John Shalf and Sudhanva Gurumurthi", title = "Memory Errors in Modern Systems: The Good, The Bad, and The Ugly", journal = j-SIGPLAN, volume = "50", number = "4", pages = "297--310", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694348", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Several recent publications have shown that hardware faults in the memory subsystem are commonplace. These faults are predicted to become more frequent in future systems that contain orders of magnitude more DRAM and SRAM than found in current memory subsystems. These memory subsystems will need to provide resilience techniques to tolerate these faults when deployed in high-performance computing systems and data centers containing tens of thousands of nodes. 
Therefore, it is critical to understand the efficacy of current hardware resilience techniques to determine whether they will be suitable for future systems. In this paper, we present a study of DRAM and SRAM faults and errors from the field. We use data from two leadership-class high-performance computer systems to analyze the reliability impact of hardware resilience schemes that are deployed in current systems. Our study has several key findings about the efficacy of many currently deployed reliability techniques such as DRAM ECC, DDR address/command parity, and SRAM ECC and parity. We also perform a methodological study, and find that counting errors instead of faults, a common practice among researchers and data center operators, can lead to incorrect conclusions about system reliability. Finally, we use our data to project the needs of future large-scale systems. We find that SRAM faults are unlikely to pose a significantly larger reliability threat in the future, while DRAM faults will be a major concern and stronger DRAM resilience schemes will be needed to maintain acceptable failure rates similar to those found on today's systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Yetim:2015:CMC, author = "Yavuz Yetim and Sharad Malik and Margaret Martonosi", title = "{CommGuard}: Mitigating Communication Errors in Error-Prone Parallel Execution", journal = j-SIGPLAN, volume = "50", number = "4", pages = "311--323", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694354", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As semiconductor technology scales towards ever-smaller transistor sizes, hardware fault rates are increasing. Since important application classes (e.g., multimedia, streaming workloads) are data-error-tolerant, recent research has proposed techniques that seek to save energy or improve yield by exploiting error tolerance at the architecture/microarchitecture level. Even seemingly error-tolerant applications, however, will crash or hang due to control-flow/memory addressing errors. In parallel computation, errors involving inter-thread communication can have equally catastrophic effects. Our work explores techniques that mitigate the impact of potentially catastrophic errors in parallel computation, while still garnering power, cost, or yield benefits from data error tolerance. Our proposed CommGuard solution uses FSM-based checkers to pad and discard data in order to maintain semantic alignment between program control flow and the data communicated between processors. CommGuard techniques are low overhead and they exploit application information already provided by some parallel programming languages (e.g., StreamIt). By converting potentially catastrophic communication errors into potentially tolerable data errors, CommGuard allows important streaming applications like JPEG and MP3 decoding to execute without crashing and to sustain good output quality, even for errors as frequent as every 500~$ \mu $s.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Kim:2015:DEF, author = "Dohyeong Kim and Yonghwi Kwon and William N.
Sumner and Xiangyu Zhang and Dongyan Xu", title = "Dual Execution for On-the-Fly Fine Grained Execution Comparison", journal = j-SIGPLAN, volume = "50", number = "4", pages = "325--338", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694394", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Execution comparison has many applications in debugging, malware analysis, software feature identification, and intrusion detection. Existing comparison techniques have various limitations. Some can only compare at the system event level and require executions to take the same input. Some require storing instruction traces that are very space-consuming and have difficulty dealing with non-determinism. In this paper, we propose a novel dual execution technique that allows on-the-fly comparison at the instruction level. Only differences between the executions are recorded. It allows executions to proceed in a coupled mode such that they share the same input sequence with the same timing, reducing nondeterminism. It also allows them to proceed in a decoupled mode such that the user can interact with each one differently. Decoupled executions can be recoupled to share the same future inputs and facilitate further comparison. We have implemented a prototype and applied it to identifying functional components for reuse, comparative debugging with new GDB primitives, and understanding real world regression failures. Our results show that dual execution is a critical enabling technique for execution comparison.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Hosek:2015:VUE, author = "Petr Hosek and Cristian Cadar", title = "{VARAN} the Unbelievable: an Efficient {$N$}-version Execution Framework", journal = j-SIGPLAN, volume = "50", number = "4", pages = "339--353", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694390", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the widespread availability of multi-core processors, running multiple diversified variants or several different versions of an application in parallel is becoming a viable approach for increasing the reliability and security of software systems. The key component of such N-version execution (NVX) systems is a runtime monitor that enables the execution of multiple versions in parallel. Unfortunately, existing monitors impose either a large performance overhead or rely on intrusive kernel-level changes. Moreover, none of the existing solutions scales well with the number of versions, since the runtime monitor acts as a performance bottleneck. In this paper, we introduce Varan, an NVX framework that combines selective binary rewriting with a novel event-streaming architecture to significantly reduce performance overhead and scale well with the number of versions, without relying on intrusive kernel modifications. 
Our evaluation shows that Varan can run NVX systems based on popular C10k network servers with only a modest performance overhead, and can be effectively used to increase software reliability using techniques such as transparent failover, live sanitization and multi-revision execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Malka:2015:REI, author = "Moshe Malka and Nadav Amit and Muli Ben-Yehuda and Dan Tsafrir", title = "{rIOMMU}: Efficient {IOMMU} for {I/O} Devices that Employ Ring Buffers", journal = j-SIGPLAN, volume = "50", number = "4", pages = "355--368", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694355", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The IOMMU allows the OS to encapsulate I/O devices in their own virtual memory spaces, thus restricting their DMAs to specific memory pages. The OS uses the IOMMU to protect itself against buggy drivers and malicious/errant devices. But the added protection comes at a cost, degrading the throughput of I/O-intensive workloads by up to an order of magnitude. This cost has motivated system designers to trade off some safety for performance, e.g., by leaving stale information in the IOTLB for a while so as to amortize costly invalidations. We observe that high-bandwidth devices---like network and PCIe SSD controllers---interact with the OS via circular ring buffers that induce a sequential, predictable workload. We design a ring IOMMU (rIOMMU) that leverages this characteristic by replacing the virtual memory page table hierarchy with a circular, flat table. A flat table is adequately supported by exactly one IOTLB entry, making every new translation an implicit invalidation of the former and thus requiring explicit invalidations only at the end of I/O bursts. Using standard networking benchmarks, we show that rIOMMU provides up to 7.56x higher throughput relative to the baseline IOMMU, and that it is within 0.77--1.00x the throughput of a system without IOMMU protection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Liu:2015:PPM, author = "Daofu Liu and Tianshi Chen and Shaoli Liu and Jinhong Zhou and Shengyuan Zhou and Olivier Teman and Xiaobing Feng and Xuehai Zhou and Yunji Chen", title = "{PuDianNao}: a Polyvalent Machine Learning Accelerator", journal = j-SIGPLAN, volume = "50", number = "4", pages = "369--381", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694358", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Machine Learning (ML) techniques are pervasive tools in various emerging commercial applications, but have to be accommodated by powerful computer systems to process very large data. Although general-purpose CPUs and GPUs have provided straightforward solutions, their energy-efficiencies are limited due to their excessive supports for flexibility. 
Hardware accelerators may achieve better energy-efficiencies, but each accelerator often accommodates only a single ML technique (family). According to the famous No-Free-Lunch theorem in the ML domain, however, an ML technique that performs well on one dataset may perform poorly on another, which implies that such an accelerator may sometimes lead to poor learning accuracy. Even setting aside learning accuracy, such an accelerator can still become inapplicable simply because the concrete ML task is altered, or the user chooses another ML technique. In this study, we present an ML accelerator called PuDianNao, which accommodates seven representative ML techniques, including k-means, k-nearest neighbors, naive Bayes, support vector machine, linear regression, classification tree, and deep neural network. Benefiting from our thorough analysis of the computational primitives and locality properties of different ML techniques, PuDianNao can perform up to 1056 GOP/s (e.g., additions and multiplications) in an area of 3.51 mm$^2$, and consumes only 596 mW. Compared with the NVIDIA K20M GPU (28~nm process), PuDianNao (65~nm process) is 1.20x faster, and can reduce the energy by 128.41x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Goiri:2015:ABA, author = "Inigo Goiri and Ricardo Bianchini and Santosh Nagarakatte and Thu D. Nguyen", title = "{ApproxHadoop}: Bringing Approximations to {MapReduce} Frameworks", journal = j-SIGPLAN, volume = "50", number = "4", pages = "383--397", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694351", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose and evaluate a framework for creating and running approximation-enabled MapReduce programs. Specifically, we propose approximation mechanisms that fit naturally into the MapReduce paradigm, including input data sampling, task dropping, and accepting and running a precise and a user-defined approximate version of the MapReduce code. We then show how to leverage statistical theories to compute error bounds for popular classes of MapReduce programs when approximating with input data sampling and/or task dropping. We implement the proposed mechanisms and error bound estimations in a prototype system called ApproxHadoop. Our evaluation uses MapReduce applications from different domains, including data analytics, scientific computing, video encoding, and machine learning. Our results show that ApproxHadoop can significantly reduce application execution time and/or energy consumption when the user is willing to tolerate small errors. For example, ApproxHadoop can reduce runtimes by up to 32x when the user can tolerate an error of 1\% with 95\% confidence.
We conclude that our framework and system can make approximation easily accessible to many application domains using the MapReduce model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Ringenburg:2015:MDQ, author = "Michael Ringenburg and Adrian Sampson and Isaac Ackerman and Luis Ceze and Dan Grossman", title = "Monitoring and Debugging the Quality of Results in Approximate Programs", journal = j-SIGPLAN, volume = "50", number = "4", pages = "399--411", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694365", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy efficiency is a key concern in the design of modern computer systems. One promising approach to energy-efficient computation, approximate computing, trades off output accuracy for significant gains in energy efficiency. However, debugging the actual cause of output quality problems in approximate programs is challenging. This paper presents dynamic techniques to debug and monitor the quality of approximate computations. We propose both offline debugging tools that instrument code to determine the key sources of output degradation and online approaches that monitor the quality of deployed applications. We present two offline debugging techniques and three online monitoring mechanisms. The first offline tool identifies correlations between output quality and the execution of individual approximate operations. The second tracks approximate operations that flow into a particular value. Our online monitoring mechanisms are complementary approaches designed for detecting quality problems in deployed applications, while still maintaining the energy savings from approximation. We present implementations of our techniques and describe their usage with seven applications. Our online monitors control output quality while still maintaining significant energy efficiency gains, and our offline tools provide new insights into the effects of approximation on output quality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Banavar:2015:WEC, author = "Guruduth Banavar", title = "{Watson} and the Era of Cognitive Computing", journal = j-SIGPLAN, volume = "50", number = "4", pages = "413--413", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694376", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the last decade, the availability of massive amounts of new data, and the development of new machine learning technologies, have augmented reasoning systems to give rise to a new class of computing systems. These ``Cognitive Systems'' learn from data, reason from models, and interact naturally with us, to perform complex tasks better than either humans or machines can do by themselves. In essence, cognitive systems help us perform like the best by penetrating the complexity of big data and leverage the power of models. One of the first cognitive systems, called Watson, demonstrated through a Jeopardy! 
exhibition match, that it was capable of answering complex factoid questions as effectively as the world's champions. Follow-on cognitive systems perform other tasks, such as discovery, reasoning, and multi-modal understanding in a variety of domains, such as healthcare, insurance, and education. We believe such cognitive systems will transform every industry and our everyday life for the better. In this talk, I will give an overview of the applications, the underlying capabilities, and some of the key challenges of cognitive systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Stewart:2015:ZDW, author = "Gordon Stewart and Mahanth Gowda and Geoffrey Mainland and Bozidar Radunovic and Dimitrios Vytiniotis and Cristina Luengo Agullo", title = "{Ziria}: a {DSL} for Wireless Systems Programming", journal = j-SIGPLAN, volume = "50", number = "4", pages = "415--428", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694368", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software-defined radio (SDR) brings the flexibility of software to wireless protocol design, promising an ideal platform for innovation and rapid protocol deployment. However, implementing modern wireless protocols on existing SDR platforms often requires careful hand-tuning of low-level code, which can undermine the advantages of software. Ziria is a new domain-specific language (DSL) that offers programming abstractions suitable for wireless physical (PHY) layer tasks while emphasizing the pipeline reconfiguration aspects of PHY programming. The Ziria compiler implements a rich set of specialized optimizations, such as lookup table generation and pipeline fusion. We also offer an algorithm --- novel due to pipeline reconfiguration --- to optimize the data widths of computations in Ziria pipelines. We demonstrate the programming flexibility of Ziria and the performance of the generated code through a detailed evaluation of a line-rate Ziria WiFi 802.11a/g implementation that is on par with, and in many cases outperforms, a hand-tuned state-of-the-art C++ implementation on commodity CPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Mullapudi:2015:PAO, author = "Ravi Teja Mullapudi and Vinay Vasista and Uday Bondhugula", title = "{PolyMage}: Automatic Optimization for Image Processing Pipelines", journal = j-SIGPLAN, volume = "50", number = "4", pages = "429--443", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694364", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents the design and implementation of PolyMage, a domain-specific language and compiler for image processing pipelines. An image processing pipeline can be viewed as a graph of interconnected stages which process images successively. Each stage typically performs one of point-wise, stencil, reduction or data-dependent operations on image pixels.
Individual stages in a pipeline typically exhibit abundant data parallelism that can be exploited with relative ease. However, the stages also require high memory bandwidth, preventing effective utilization of parallelism available on modern architectures. For applications that demand high performance, the traditional options are to use optimized libraries like OpenCV or to optimize manually. While using libraries precludes optimization across library routines, manual optimization accounting for both parallelism and locality is very tedious. The focus of our system, PolyMage, is on automatically generating high-performance implementations of image processing pipelines expressed in a high-level declarative language. Our optimization approach primarily relies on the transformation and code generation capabilities of the polyhedral compiler framework. To the best of our knowledge, this is the first model-driven compiler for image processing pipelines that performs complex fusion, tiling, and storage optimization automatically. Experimental results on a modern multicore system show that the performance achieved by our automatic approach is up to 1.81x better than that achieved through manual tuning in Halide, a state-of-the-art language and compiler for image processing pipelines. For a camera raw image processing pipeline, our performance is comparable to that of a hand-tuned implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Heckey:2015:CMC, author = "Jeff Heckey and Shruti Patil and Ali JavadiAbhari and Adam Holmes and Daniel Kudrow and Kenneth R. Brown and Diana Franklin and Frederic T. Chong and Margaret Martonosi", title = "Compiler Management of Communication and Parallelism for Quantum Computation", journal = j-SIGPLAN, volume = "50", number = "4", pages = "445--456", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694357", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Quantum computing (QC) offers huge promise to accelerate a range of computationally intensive benchmarks. Quantum computing is limited, however, by the challenges of decoherence: i.e., a quantum state can only be maintained for short windows of time before it decoheres. While quantum error correction codes can protect against decoherence, fast execution time is the best defense against decoherence, so efficient architectures and effective scheduling algorithms are necessary. This paper proposes the Multi-SIMD QC architecture and then proposes and evaluates effective schedulers to map benchmark descriptions onto Multi-SIMD architectures. The Multi-SIMD model consists of a small number of SIMD regions, each of which may support operations on up to thousands of qubits per cycle. Efficient Multi-SIMD operation requires efficient scheduling. This work develops schedulers to reduce communication requirements of qubits between operating regions, while also improving parallelism. We find that communication to global memory is a dominant cost in QC. We also note that many quantum benchmarks have long serial operation paths (although each operation may be data parallel).
To exploit this characteristic, we introduce Longest-Path-First Scheduling (LPFS), which pins operations to SIMD regions to keep data in place and reduce communication to memory. The use of small, local scratchpad memories further reduces communication. Our results show a 3\% to 308\% improvement for LPFS over conventional scheduling algorithms, and an additional 3\% to 64\% improvement using scratchpad memories. Our work is the most comprehensive software-to-quantum toolflow published to date, with efficient and practical scheduling techniques that reduce communication and increase parallelism for full-scale quantum code executing up to a trillion quantum gate operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Hassaan:2015:KDG, author = "Muhammad Amber Hassaan and Donald D. Nguyen and Keshav K. Pingali", title = "Kinetic Dependence Graphs", journal = j-SIGPLAN, volume = "50", number = "4", pages = "457--471", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694363", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Task graphs or dependence graphs are used in runtime systems to schedule tasks for parallel execution. In problem domains such as dense linear algebra and signal processing, dependence graphs can be generated from a program by static analysis. However, in emerging problem domains such as graph analytics, the set of tasks and dependences between tasks in a program are complex functions of runtime values and cannot be determined statically. In this paper, we introduce a novel approach for exploiting parallelism in such programs. This approach is based on a data structure called the kinetic dependence graph (KDG), which consists of a dependence graph together with update rules that incrementally update the graph to reflect changes in the dependence structure whenever a task is completed. We have implemented a simple programming model that allows programmers to write these applications at a high level of abstraction, and a runtime within the Galois system [15] that builds the KDG automatically and executes the program in parallel.
On a suite of programs that are difficult to parallelize otherwise, we have obtained speedups of up to 33 on 40 cores, outperforming third-party implementations in many cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Sidiroglou-Douskos:2015:TAI, author = "Stelios Sidiroglou-Douskos and Eric Lahtinen and Nathan Rittenhouse and Paolo Piselli and Fan Long and Deokhwan Kim and Martin Rinard", title = "Targeted Automatic Integer Overflow Discovery Using Goal-Directed Conditional Branch Enforcement", journal = j-SIGPLAN, volume = "50", number = "4", pages = "473--486", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694389", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new technique and system, DIODE, for automatically generating inputs that trigger overflows at memory allocation sites. DIODE is designed to identify relevant sanity checks that inputs must satisfy to trigger overflows at target memory allocation sites, then generate inputs that satisfy these sanity checks to successfully trigger the overflow. DIODE works with off-the-shelf, production x86 binaries. Our results show that, for our benchmark set of applications, and for every target memory allocation site exercised by our seed inputs (which the applications process correctly with no overflows), either (1) DIODE is able to generate an input that triggers an overflow at that site or (2) there is no input that would trigger an overflow for the observed target expression at that site.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Dhawan:2015:ASS, author = "Udit Dhawan and Catalin Hritcu and Raphael Rubin and Nikos Vasilakis and Silviu Chiricescu and Jonathan M. Smith and Thomas F. {Knight, Jr.} and Benjamin C. Pierce and Andre DeHon", title = "Architectural Support for Software-Defined Metadata Processing", journal = j-SIGPLAN, volume = "50", number = "4", pages = "487--502", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694383", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Optimized hardware for propagating and checking software-programmable metadata tags can achieve low runtime overhead. We generalize prior work on hardware tagging by considering a generic architecture that supports software-defined policies over metadata of arbitrary size and complexity; we introduce several novel microarchitectural optimizations that keep the overhead of this rich processing low. Our model thus achieves the efficiency of previous hardware-based approaches with the flexibility of the software-based ones. We demonstrate this by using it to enforce four diverse safety and security policies---spatial and temporal memory safety, taint tracking, control-flow integrity, and code and data separation---plus a composite policy that enforces all of them simultaneously.
Experiments on SPEC CPU2006 benchmarks with a PUMP-enhanced RISC processor show modest impact on runtime (typically under 10\%) and power ceiling (less than 10\%), in return for some increase in energy usage (typically under 60\%) and area for on-chip memory structures (110\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Zhang:2015:HDL, author = "Danfeng Zhang and Yao Wang and G. Edward Suh and Andrew C. Myers", title = "A Hardware Design Language for Timing-Sensitive Information-Flow Security", journal = j-SIGPLAN, volume = "50", number = "4", pages = "503--516", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694372", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Information security can be compromised by leakage via low-level hardware features. One recently prominent example is cache probing attacks, which rely on timing channels created by caches. We introduce a hardware design language, SecVerilog, which makes it possible to statically analyze information flow at the hardware level. With SecVerilog, systems can be built with verifiable control of timing channels and other information channels. SecVerilog is Verilog, extended with expressive type annotations that enable precise reasoning about information flow. It also comes with rigorous formal assurance: we prove that SecVerilog enforces timing-sensitive noninterference and thus ensures secure information flow. By building a secure MIPS processor and its caches, we demonstrate that SecVerilog makes it possible to build complex hardware designs with verified security, yet with low overhead in time, space, and HW designer effort.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Hicks:2015:SLR, author = "Matthew Hicks and Cynthia Sturton and Samuel T. King and Jonathan M. Smith", title = "{SPECS}: a Lightweight Runtime Mechanism for Protecting Software from Security-Critical Processor Bugs", journal = j-SIGPLAN, volume = "50", number = "4", pages = "517--529", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694366", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Processor implementation errata remain a problem, and worse, a subset of these bugs are security-critical. We classified 7 years of errata from recent commercial processors to understand the magnitude and severity of this problem, and found that of 301 errata analyzed, 28 are security-critical. We propose the SECURITY-CRITICAL PROCESSOR ERRATA CATCHING SYSTEM (SPECS) as a low-overhead solution to this problem. SPECS employs a dynamic verification strategy that is made lightweight by limiting protection to only security-critical processor state. As a proof-of-concept, we implement a hardware prototype of SPECS in an open source processor. Using this prototype, we evaluate SPECS against a set of 14 bugs inspired by the types of security-critical errata we discovered in the classification phase.
The evaluation shows that SPECS is 86\% effective as a defense when deployed using only ISA-level state; incurs less than 5\% area and power overhead; and has no software run-time overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Duan:2015:AMF, author = "Yuelu Duan and Nima Honarmand and Josep Torrellas", title = "Asymmetric Memory Fences: Optimizing Both Performance and Implementability", journal = j-SIGPLAN, volume = "50", number = "4", pages = "531--543", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694388", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There have been several recent efforts to improve the performance of fences. The most aggressive designs allow post-fence accesses to retire and complete before the fence completes. Unfortunately, such designs present implementation difficulties due to their reliance on global state and structures. This paper's goal is to optimize both the performance and the implementability of fences. We start off with a design like the most aggressive ones but without the global state. We call it Weak Fence or wF. Since the concurrent execution of multiple wFs can deadlock, we combine wFs with a conventional fence (i.e., Strong Fence or sF) for the less performance-critical thread(s). We call the result an Asymmetric fence group. We also propose a taxonomy of Asymmetric fence groups under TSO. Compared to past aggressive fences, Asymmetric fence groups are both substantially easier to implement and have higher average performance. The two main designs presented (WS+ and W+) speed up workloads under TSO by an average of 13\% and 21\%, respectively, over conventional fences.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Sung:2015:DES, author = "Hyojin Sung and Sarita V. Adve", title = "{DeNovoSync}: Efficient Support for Arbitrary Synchronization without Writer-Initiated Invalidations", journal = j-SIGPLAN, volume = "50", number = "4", pages = "545--559", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694356", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Current shared-memory hardware is complex and inefficient. Prior work on the DeNovo coherence protocol showed that disciplined shared-memory programming models can enable more complexity-, performance-, and energy-efficient hardware than the state-of-the-art MESI protocol. DeNovo, however, severely restricted the synchronization constructs an application can support. This paper proposes DeNovoSync, a technique to support arbitrary synchronization in DeNovo. The key challenge is that DeNovo exploits race-freedom to use reader-initiated local self-invalidations (instead of conventional writer-initiated remote cache invalidations) to ensure coherence. Synchronization accesses are inherently racy and not directly amenable to self-invalidations.
DeNovoSync addresses this challenge using a novel combination of registration of all synchronization reads with a judicious hardware backoff to limit unnecessary registrations. For a wide variety of synchronization constructs and applications, compared to MESI, DeNovoSync shows comparable or up to 22\% lower execution time and up to 58\% lower network traffic, enabling DeNovo's advantages for a much broader class of software than previously possible.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Sengupta:2015:HSD, author = "Aritra Sengupta and Swarnendu Biswas and Minjia Zhang and Michael D. Bond and Milind Kulkarni", title = "Hybrid Static-Dynamic Analysis for Statically Bounded Region Serializability", journal = j-SIGPLAN, volume = "50", number = "4", pages = "561--575", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694379", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data races are common. They are difficult to detect, avoid, or eliminate, and programmers sometimes introduce them intentionally. However, shared-memory programs with data races have unexpected, erroneous behaviors. Intentional and unintentional data races lead to atomicity and sequential consistency (SC) violations, and they make it more difficult to understand, test, and verify software. Existing approaches for providing stronger guarantees for racy executions add high run-time overhead and/or rely on custom hardware. This paper shows how to provide stronger semantics for racy programs while providing relatively good performance on commodity systems. A novel hybrid static--dynamic analysis called \emph{EnfoRSer} provides end-to-end support for a memory model called \emph{statically bounded region serializability} (SBRS) that is not only stronger than weak memory models but strictly stronger than SC. EnfoRSer uses static compiler analysis to transform regions, and dynamic analysis to detect and resolve conflicts at run time. By demonstrating commodity support for a reasonably strong memory model with reasonable overheads, we show its potential as an always-on execution model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Alglave:2015:GCW, author = "Jade Alglave and Mark Batty and Alastair F. Donaldson and Ganesh Gopalakrishnan and Jeroen Ketema and Daniel Poetzl and Tyler Sorensen and John Wickerson", title = "{GPU} Concurrency: Weak Behaviours and Programming Assumptions", journal = j-SIGPLAN, volume = "50", number = "4", pages = "577--591", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694391", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrency is pervasive and perplexing, particularly on graphics processing units (GPUs). Current specifications of languages and hardware are inconclusive; thus programmers often rely on folklore assumptions when writing software.
To remedy this state of affairs, we conducted a large empirical study of the concurrent behaviour of deployed GPUs. Armed with litmus tests (i.e. short concurrent programs), we questioned the assumptions in programming guides and vendor documentation about the guarantees provided by hardware. We developed a tool to generate thousands of litmus tests and run them under stressful workloads. We observed a litany of previously elusive weak behaviours, and exposed folklore beliefs about GPU programming---often supported by official tutorials---as false. As a way forward, we propose a model of Nvidia GPU hardware, which correctly models every behaviour witnessed in our experiments. The model is a variant of SPARC Relaxed Memory Order (RMO), structured following the GPU concurrency hierarchy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Park:2015:CCP, author = "Jason Jong Kyu Park and Yongjun Park and Scott Mahlke", title = "{Chimera}: Collaborative Preemption for Multitasking on a Shared {GPU}", journal = j-SIGPLAN, volume = "50", number = "4", pages = "593--606", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694346", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The demand for multitasking on graphics processing units (GPUs) is constantly increasing as they have become one of the default components on modern computer systems along with traditional processors (CPUs). Preemptive multitasking on CPUs has been primarily supported through context switching. However, the same preemption strategy incurs substantial overhead due to the large context in GPUs. The overhead comes in two dimensions: a preempting kernel suffers from a long preemption latency, and the system throughput is wasted during the switch. Without precise control over the large preemption overhead, multitasking on GPUs has little use for applications with strict latency requirements. In this paper, we propose Chimera, a collaborative preemption approach that can precisely control the overhead for multitasking on GPUs. Chimera first introduces streaming multiprocessor (SM) flushing, which can instantly preempt an SM by detecting and exploiting idempotent execution. Chimera utilizes flushing collaboratively with two previously proposed preemption techniques for GPUs, namely context switching and draining, to minimize throughput overhead while achieving a required preemption latency. Evaluations show that Chimera violates the deadline for only 0.2\% of preemption requests when a 15us preemption latency constraint is used. For multi-programmed workloads, Chimera can improve the average normalized turnaround time by 5.5x, and system throughput by 12.2\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Agarwal:2015:PPS, author = "Neha Agarwal and David Nellans and Mark Stephenson and Mike O'Connor and Stephen W.
Keckler", title = "Page Placement Strategies for {GPUs} within Heterogeneous Memory Systems", journal = j-SIGPLAN, volume = "50", number = "4", pages = "607--618", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694381", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Systems from smartphones to supercomputers are increasingly heterogeneous, being composed of both CPUs and GPUs. To maximize cost and energy efficiency, these systems will increasingly use globally-addressable heterogeneous memory systems, making choices about memory page placement critical to performance. In this work we show that current page placement policies are not sufficient to maximize GPU performance in these heterogeneous memory systems. We propose two new page placement policies that improve GPU performance: one application agnostic and one using application profile information. Our application agnostic policy, bandwidth-aware (BW-AWARE) placement, maximizes GPU throughput by balancing page placement across the memories based on the aggregate memory bandwidth available in a system. Our simulation-based results show that BW-AWARE placement outperforms the existing Linux INTERLEAVE and LOCAL policies by 35\% and 18\% on average for GPU compute workloads. We build upon BW-AWARE placement by developing a compiler-based profiling mechanism that provides programmers with information about GPU application data structure access patterns. Combining this information with simple program-annotated hints about memory placement, our hint-based page placement approach performs within 90\% of oracular page placement on average, largely mitigating the need for costly dynamic page tracking and migration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Zhao:2015:FPS, author = "Zhijia Zhao and Xipeng Shen", title = "On-the-Fly Principled Speculation for {FSM} Parallelization", journal = j-SIGPLAN, volume = "50", number = "4", pages = "619--630", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694369", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Finite State Machine (FSM) is the backbone of an important class of applications in many domains. Its parallelization has been extremely difficult due to inherent strong dependences in the computation. Recently, principled speculation shows good promise to solve the problem. However, the reliance on offline training makes the approach inconvenient to adopt and hard to apply to many practical FSM applications, which often deal with a large variety of inputs different from training inputs. This work presents an assembly of techniques that completely remove the needs for offline training. The techniques include a set of theoretical results on inherent properties of FSMs, and two newly designed dynamic optimizations for efficient FSM characterization. The new techniques, for the first time, make principle speculation applicable on the fly, and enables swift, automatic configuration of speculative parallelizations to best suit a given FSM and its current input. 
They eliminate the fundamental barrier for practical adoption of principled speculation for FSM parallelization. Experiments show that the new techniques give significantly higher speedups for some difficult FSM applications in the presence of input changes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{David:2015:ACS, author = "Tudor David and Rachid Guerraoui and Vasileios Trigonakis", title = "Asynchronized Concurrency: The Secret to Scaling Concurrent Search Data Structures", journal = j-SIGPLAN, volume = "50", number = "4", pages = "631--644", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694359", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce ``asynchronized concurrency (ASCY),'' a paradigm consisting of four complementary programming patterns. ASCY calls for the design of concurrent search data structures (CSDSs) to resemble that of their sequential counterparts. We argue that ASCY leads to implementations which are portably scalable: they scale across different types of hardware platforms, including single and multi-socket ones, for various classes of workloads, such as read-only and read-write, and according to different performance metrics, including throughput, latency, and energy. We substantiate our thesis through the most exhaustive evaluation of CSDSs to date, involving 6 platforms, 22 state-of-the-art CSDS algorithms, 10 re-engineered state-of-the-art CSDS algorithms following the ASCY patterns, and 2 new CSDS algorithms designed with ASCY in mind. We observe up to 30\% improvements in throughput in the re-engineered algorithms, while our new algorithms outperform the state-of-the-art alternatives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Bhatotia:2015:ITL, author = "Pramod Bhatotia and Pedro Fonseca and Umut A. Acar and Bj{\"o}rn B. Brandenburg and Rodrigo Rodrigues", title = "{iThreads}: a Threading Library for Parallel Incremental Computation", journal = j-SIGPLAN, volume = "50", number = "4", pages = "645--659", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694371", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Incremental computation strives for efficient successive runs of applications by re-executing only those parts of the computation that are affected by a given input change instead of recomputing everything from scratch. To realize these benefits automatically, we describe iThreads, a threading library for parallel incremental computation. iThreads supports unmodified shared-memory multithreaded programs: it can be used as a replacement for pthreads by a simple exchange of dynamically linked libraries, without even recompiling the application code.
To enable such an interface, we designed algorithms and an implementation to operate at the compiled binary code level by leveraging MMU-assisted memory access tracking and process-based thread isolation. Our evaluation on a multicore platform using applications from the PARSEC and Phoenix benchmarks and two case studies shows significant performance gains.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Gidra:2015:NGC, author = "Lokesh Gidra and Ga{\"e}l Thomas and Julien Sopena and Marc Shapiro and Nhan Nguyen", title = "{NumaGiC}: a Garbage Collector for Big Data on Big {NUMA} Machines", journal = j-SIGPLAN, volume = "50", number = "4", pages = "661--673", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694361", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "On contemporary cache-coherent Non-Uniform Memory Access (ccNUMA) architectures, applications with a large memory footprint suffer from the cost of the garbage collector (GC), because, as the GC scans the reference graph, it makes many remote memory accesses, saturating the interconnect between memory nodes. We address this problem with NumaGiC, a GC with a mostly-distributed design. In order to maximise memory access locality during collection, a GC thread avoids accessing a different memory node, instead notifying a remote GC thread with a message; nonetheless, NumaGiC avoids the drawbacks of a pure distributed design, which tends to decrease parallelism. We compare NumaGiC with Parallel Scavenge and NAPS on two different ccNUMA architectures running on the Hotspot Java Virtual Machine of OpenJDK 7. On Spark and Neo4j, two industry-strength analytics applications, with heap sizes ranging from 160GB to 350GB, and on SPECjbb2013 and SPECjbb2005, our GC improves overall performance by up to 45\% over NAPS (up to 94\% over Parallel Scavenge), and increases the performance of the collector itself by up to 3.6x over NAPS (up to 5.4x over Parallel Scavenge).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Nguyen:2015:FCR, author = "Khanh Nguyen and Kai Wang and Yingyi Bu and Lu Fang and Jianfei Hu and Guoqing Xu", title = "{FACADE}: a Compiler and Runtime for (Almost) Object-Bounded Big Data Applications", journal = j-SIGPLAN, volume = "50", number = "4", pages = "675--690", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694345", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The past decade has witnessed the increasing demands on data-driven business intelligence that led to the proliferation of data-intensive applications. A managed object-oriented programming language such as Java is often the developer's choice for implementing such applications, due to its quick development cycle and rich community resource.
While the use of such languages makes programming easier, their automated memory management comes at a cost. When the managed runtime meets Big Data, this cost is significantly magnified and becomes a scalability-prohibiting bottleneck. This paper presents a novel compiler framework, called Facade, that can generate highly-efficient data manipulation code by automatically transforming the data path of an existing Big Data application. The key treatment is that in the generated code, the number of runtime heap objects created for data types in each thread is (almost) statically bounded, leading to significantly reduced memory management cost and improved scalability. We have implemented Facade and used it to transform 7 common applications on 3 real-world, already well-optimized Big Data frameworks: GraphChi, Hyracks, and GPS. Our experimental results are very positive: the generated programs have (1) achieved a 3\%--48\% execution time reduction and an up to 88X GC reduction; (2) consumed up to 50\% less memory, and (3) scaled to much larger datasets.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Agrawal:2015:ASD, author = "Varun Agrawal and Abhiroop Dabral and Tapti Palit and Yongming Shen and Michael Ferdman", title = "Architectural Support for Dynamic Linking", journal = j-SIGPLAN, volume = "50", number = "4", pages = "691--702", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694392", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "All software in use today relies on libraries, including standard libraries (e.g., C, C++) and application-specific libraries (e.g., libxml, libpng). Most libraries are loaded in memory and dynamically linked when programs are launched, resolving symbol addresses across the applications and libraries. Dynamic linking has many benefits: It allows code to be reused between applications, conserves memory (because only one copy of a library is kept in memory for all the applications that share it), and allows libraries to be patched and updated without modifying programs, among numerous other benefits. However, these benefits come at the cost of performance. For every call made to a function in a dynamically linked library, a trampoline is used to read the function address from a lookup table and branch to the function, incurring memory load and branch operations. Static linking avoids this performance penalty, but loses all the benefits of dynamic linking. Given its myriad benefits, dynamic linking is the predominant choice today, despite the performance cost. In this work, we propose a speculative hardware mechanism to optimize dynamic linking by avoiding executing the trampolines for library function calls, providing the benefits of dynamic linking with the performance of static linking. Speculatively skipping the memory load and branch operations of the library call trampolines improves performance by reducing the number of executed instructions and gains additional performance by reducing pressure on the instruction and data caches, TLBs, and branch predictors. Because the indirect targets of library call trampolines do not change during program execution, our speculative mechanism never misspeculates in practice. 
We evaluate our technique on real hardware with production software and observe up to 4\% speedup using only 1.5KB of on-chip storage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '15 conference proceedings.", } @Article{Baird:2015:OTC, author = "Ryan Baird and Peter Gavin and Magnus Sj{\"a}lander and David Whalley and Gang-Ryung Uh", title = "Optimizing Transfers of Control in the Static Pipeline Architecture", journal = j-SIGPLAN, volume = "50", number = "5", pages = "1:1--1:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754952", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Statically pipelined processors offer a new way to improve the performance beyond that of a traditional in-order pipeline while simultaneously reducing energy usage by enabling the compiler to control more fine-grained details of the program execution. This paper describes how a compiler can exploit the features of the static pipeline architecture to apply optimizations on transfers of control that are not possible on a conventional architecture. The optimizations presented in this paper include hoisting the target address calculations for branches, jumps, and calls out of loops, performing branch chaining between calls and jumps, hoisting the setting of return addresses out of loops, and exploiting conditional calls and returns. The benefits of performing these transfer of control optimizations include a 6.8\% reduction in execution time and a 3.6\% decrease in estimated energy usage.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Liu:2015:CCD, author = "Qingrui Liu and Changhee Jung and Dongyoon Lee and Devesh Tiwari", title = "{Clover}: Compiler Directed Lightweight Soft Error Resilience", journal = j-SIGPLAN, volume = "50", number = "5", pages = "2:1--2:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754959", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents Clover, a compiler directed soft error detection and recovery scheme for lightweight soft error resilience. The compiler carefully generates soft error tolerant code based on idempotent processing without explicit checkpoint. During program execution, Clover relies on a small number of acoustic wave detectors deployed in the processor to identify soft errors by sensing the wave made by a particle strike. To cope with DUE (detected unrecoverable errors) caused by the sensing latency of error detection, Clover leverages a novel selective instruction duplication technique called tail-DMR (dual modular redundancy). Once a soft error is detected by either the sensor or the tail-DMR, Clover takes care of the error as in the case of exception handling. To recover from the error, Clover simply redirects program control to the beginning of the code region where the error is detected. 
The experimental results demonstrate that the average runtime overhead is only 26\%, which is a 75\% reduction compared to that of the state-of-the-art soft error resilience technique.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Bardizbanyan:2015:IDA, author = "Alen Bardizbanyan and Magnus Sj{\"a}lander and David Whalley and Per Larsson-Edefors", title = "Improving Data Access Efficiency by Using Context-Aware Loads and Stores", journal = j-SIGPLAN, volume = "50", number = "5", pages = "3:1--3:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754960", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Memory operations have a significant impact on both performance and energy usage even when an access hits in the level-one data cache (L1 DC). Load instructions in particular affect performance as they frequently result in stalls since the register to be loaded is often referenced before the data is available in the pipeline. L1 DC accesses also impact energy usage as they typically require significantly more energy than a register file access. Despite their impact on performance and energy usage, L1 DC accesses on most processors are performed in a general fashion without regard to the context in which the load or store operation is performed. We describe a set of techniques where the compiler enhances load and store instructions so that they can be executed with fewer stalls and/or enable the L1 DC to be accessed in a more energy-efficient manner. We show that using these techniques can simultaneously achieve a 6\% gain in performance and a 43\% reduction in L1 DC energy usage.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Stilkerich:2015:PGA, author = "Isabella Stilkerich and Clemens Lang and Christoph Erhardt and Michael Stilkerich", title = "A Practical Getaway: Applications of Escape Analysis in Embedded Real-Time Systems", journal = j-SIGPLAN, volume = "50", number = "5", pages = "4:1--4:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754961", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The use of a managed, type-safe language such as Java in real-time and embedded systems offers productivity and, in particular, safety and dependability benefits at a reasonable cost. It has been shown for commodity systems that escape analysis (EA) enables a set of useful optimizations, and benefits from the properties of a type-safe language. In this paper, we explore the application of escape analysis in KESO [34], a Java ahead-of-time compiler targeting (deeply) embedded real-time systems.
We present specific applications of EA for embedded programs that go beyond the widely known stack-allocation and synchronization optimizations, such as extended remote procedure call support for software-isolated applications, automated inference of immutable data, or improved upper space and time bounds for worst-case estimations.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Manilov:2015:FRT, author = "Stanislav Manilov and Bj{\"o}rn Franke and Anthony Magrath and Cedric Andrieu", title = "Free Rider: a Tool for Retargeting Platform-Specific Intrinsic Functions", journal = j-SIGPLAN, volume = "50", number = "5", pages = "5:1--5:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754962", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Short-vector SIMD and DSP instructions are popular extensions to common ISAs. These extensions deliver excellent performance and compact code for some compute-intensive applications, but they require specialised compiler support. To enable the programmer to explicitly request the use of such an instruction, many C compilers provide platform-specific intrinsic functions, whose implementation is handled specially by the compiler. The use of such intrinsics, however, inevitably results in non-portable code. In this paper we develop a novel methodology for retargeting such non-portable code, which maps intrinsics from one platform to another, taking advantage of similar intrinsics on the target platform. We employ a description language to specify the signature and semantics of intrinsics and perform graph-based pattern matching and high-level code transformations to derive optimised implementations exploiting the target's intrinsics, wherever possible. We demonstrate the effectiveness of our new methodology, implemented in the FREE RIDER tool, by automatically retargeting benchmarks derived from OpenCV samples and a complex embedded application optimised to run on an Arm Cortex-M4 to an Intel Edison module with SSE4.2 instructions. We achieve a speedup of up to 3.73 over a plain C baseline, and on average 96.0\% of the speedup of manually ported and optimised versions of the benchmarks.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Dietrich:2015:CKC, author = "Christian Dietrich and Martin Hoffmann and Daniel Lohmann", title = "Cross-Kernel Control-Flow--Graph Analysis for Event-Driven Real-Time Systems", journal = j-SIGPLAN, volume = "50", number = "5", pages = "6:1--6:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754963", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Embedded real-time control systems generally have a dedicated purpose and fixed set of functionalities. This manifests in a large amount of implicit and explicit static knowledge, available already at compile time.
Modern compilers can extract and exploit this information to perform extensive whole-program analyses and interprocedural optimizations. However, these analyses typically end at the application--kernel boundary; thus, control-flow transitions between different threads are not yet covered. This restriction stems from the pessimistic assumption of a probabilistic scheduling policy of the underlying operating system, impeding detailed predictions of the overall system behavior. Real-time operating systems, however, do provide deterministic and exactly specified scheduling decisions, as embedded control systems rely on timely and precise behavior. In this paper, we present an approach that incorporates the RTOS semantics into the control-flow analysis, to cross the application--kernel boundary. By combining operating system semantics, the static system configuration, and the application logic, we determine a cross-kernel control-flow graph that provides a global view of all possible execution paths of a real-time system. Having this knowledge at hand enables us to tailor the operating system kernel more closely to the particular application scenario. Using the example of a real-world safety-critical control system, we present two possible use cases: Run-time optimizations, by means of specialized system calls for each call site, allow the kernel execution path to be sped up by 33 percent in our benchmark scenario. Automated generation of OS state assertions on the expected system behavior, targeting transient hardware fault tolerance, yields significant robustness improvements.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Ghosh:2015:EEA, author = "Soumyadeep Ghosh and Yongjun Park and Arun Raman", title = "Enabling Efficient Alias Speculation", journal = j-SIGPLAN, volume = "50", number = "5", pages = "7:1--7:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754964", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Microprocessors designed using HW/SW codesign principles, such as Transmeta{\TM} Efficeon{\TM} and the soon-to-ship NVIDIA 64-bit Tegra{\reg} K1, use dynamic binary optimization to extract instruction-level parallelism. Many code optimizations are made significantly more effective through the use of alias speculation. The state-of-the-art alias speculation system, SMARQ, provides 40\% speedup on average over a system with no alias speculation. This performance, however, comes at the cost of introducing new alias registers and increased power consumption due to new checks for validating speculation. Consequently, improving the efficiency of alias speculation by reducing alias register requirements and rationalizing speculation validation checks is critical for the viability of SMARQ. This paper presents alias coalescing, a novel technique to significantly improve the efficiency of SMARQ through a synergistic combination of compiler and microarchitectural techniques.
By using a more compact encoding for memory access ranges for memory instructions, alias coalescing simultaneously reduces the alias register pressure in SMARQ by a geomean of 26.09\% and 39.96\%, and the dynamic alias checks by 20.73\% and 33.87\%, across the entire SPEC CINT2006 and SPEC CFP2006 suites respectively.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Zheng:2015:WAD, author = "Wenguang Zheng and Hui Wu", title = "{WCET-Aware} Dynamic {D}-cache Locking for a Single Task", journal = j-SIGPLAN, volume = "50", number = "5", pages = "8:1--8:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754965", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Caches have been extensively used to bridge the increasing speed gap between processors and off-chip memory. However, caches make it much harder to compute the WCET (Worst-Case Execution Time) of a program. Cache locking is an effective technique for overcoming the unpredictability problem of caches. We investigate the WCET-aware D-cache locking problem for a single task, and propose two dynamic cache locking approaches. The first approach formulates the problem as a global ILP (Integer Linear Programming) problem that simultaneously selects a near-optimal set of variables as the locked cache contents and allocates them to the D-cache. The second one iteratively constructs a subgraph of the CFG of the task where the lengths of all the paths are close to the longest path length, and uses an ILP formulation to select a near-optimal set of variables in the subgraph as the locked cache contents and allocate them to the D-cache. For both approaches, we propose a novel, efficient D-cache allocation algorithm. We have implemented both approaches and compared them with the longest path-based, dynamic cache locking approach proposed in [22] and the static WCET analysis approach without cache locking proposed in [14] by using a set of benchmarks from the M{\"a}lardalen WCET benchmark suite, SNU real-time benchmarks, and the benchmarks used in [27]. Compared to the static WCET analysis approach, the average WCET improvements of the first approach range between 11.3\% and 31.6\%, and the average WCET improvements of the second approach range between 12.3\% and 32.9\%.
Compared to the longest path-based, dynamic cache locking approach, the average WCET improvements of the first approach range between 4.7\% and 14.3\%, and the average WCET improvements of the second approach range between 5.3\% and 15.0\%.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Lin:2015:STU, author = "Yixiao Lin and Sayan Mitra", title = "{StarL}: Towards a Unified Framework for Programming, Simulating and Verifying Distributed Robotic Systems", journal = j-SIGPLAN, volume = "50", number = "5", pages = "9:1--9:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754966", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We developed StarL as a framework for programming, simulating, and verifying distributed systems that interact with physical processes. The StarL framework has (a) a collection of distributed primitives for coordination, such as mutual exclusion, registration, and geocast, that can be used to build sophisticated applications, (b) theory libraries for verifying StarL applications in the PVS theorem prover, and (c) an execution environment that can be used to deploy the applications on hardware or to execute them in a discrete event simulator. The primitives have (i) abstract, nondeterministic specifications in terms of invariants, and assume-guarantee style progress properties, (ii) implementations in Java/Android that always satisfy the invariants and attempt progress using best-effort strategies. The PVS theories specify the invariant and progress properties of the primitives, and have to be appropriately instantiated and composed with the application's state machine to prove properties about the application. We have built two execution environments: one for deploying applications on the Android/iRobot Create platform and a second one for simulating large instantiations of the applications in a discrete event simulator. The capabilities are illustrated with a StarL application for vehicle-to-vehicle coordination in an automatic intersection that uses primitives for point-to-point motion, mutual exclusion, and registration.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Zhang:2015:IPA, author = "Zhenkai Zhang and Xenofon Koutsoukos", title = "Improving the Precision of Abstract Interpretation Based Cache Persistence Analysis", journal = j-SIGPLAN, volume = "50", number = "5", pages = "10:1--10:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754967", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When designing hard real-time embedded systems, it is necessary to estimate the worst-case execution time (WCET) of each task for schedulability analysis. Precise cache persistence analysis can significantly tighten the WCET estimation, especially when the program has many loops. Methods for persistence analysis should safely and precisely classify memory references as persistent.
Existing safe approaches suffer from multiple sources of pessimism and may not provide precise results. In this paper, we first identify some sources of pessimism that two recent approaches based on younger set and may analysis may encounter. Then, we propose two methods to eliminate these sources of pessimism. The first method improves the update function of the may analysis-based approach, and the second method integrates the younger set-based and may analysis-based approaches together to further reduce pessimism. We also prove that the two proposed methods are still safe. We evaluate the approaches on a set of benchmarks and observe that the number of memory references classified as persistent is increased by the proposed methods. Moreover, we empirically compare the storage space and analysis time used by different methods.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Barijough:2015:IAM, author = "Kamyar Mirzazad Barijough and Matin Hashemi and Volodymyr Khibin and Soheil Ghiasi", title = "Implementation-Aware Model Analysis: The Case of Buffer-Throughput Tradeoff in Streaming Applications", journal = j-SIGPLAN, volume = "50", number = "5", pages = "11:1--11:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754968", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Models of computation abstract away a number of implementation details in favor of well-defined semantics. While this has unquestionable benefits, we argue that analysis of models solely based on operational semantics (implementation-oblivious analysis) is unfit to drive implementation design space exploration. Specifically, we study the tradeoff between buffer size and streaming throughput in applications modeled as synchronous data flow (SDF) graphs. We demonstrate the inherent inaccuracy of the implementation-oblivious approach, which only considers the SDF operational semantics. We propose a rigorous transformation, which equips the state-of-the-art buffer-throughput tradeoff analysis technique with implementation awareness. Extensive empirical evaluation shows that our approach results in significantly more accurate estimates of streaming throughput at the model level, while running two orders of magnitude faster than cycle-accurate simulation of implementations.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Liu:2015:SDS, author = "Chen Liu and Chengmo Yang", title = "Secure and Durable {(SEDURA)}: an Integrated Encryption and Wear-leveling Framework for {PCM}-based Main Memory", journal = j-SIGPLAN, volume = "50", number = "5", pages = "12:1--12:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754969", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Phase change memory (PCM) is considered a promising candidate for next-generation main memory.
Despite its advantages of lower power and high density, PCM faces critical security challenges due to its non-volatility: data are still accessible by the attacker even if the device is detached from a power supply. While encryption has been widely adopted as the solution to protect data, it not only creates additional performance and energy overhead during data encryption/decryption, but also hurts PCM lifetime by introducing more writes to PCM cells. In this paper, we propose a framework that integrates encryption and wear-leveling so as to mitigate the adverse impact of encryption on PCM performance and lifetime. Moreover, by randomizing the address space during wear-leveling, an extra level of protection is provided to the data in memory. We propose two algorithms that respectively prioritize data security and memory lifetime, allowing designers to trade off between these two factors based on their needs. Compared to previous encryption techniques, the proposed SEDURA framework is able to deliver both more randomness to protect data and more balanced PCM writes, thus effectively balancing the three aspects of data security, application performance, and device lifetime.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Procter:2015:SDH, author = "Adam Procter and William L. Harrison and Ian Graves and Michela Becchi and Gerard Allwein", title = "Semantics Driven Hardware Design, Implementation, and Verification with {ReWire}", journal = j-SIGPLAN, volume = "50", number = "5", pages = "13:1--13:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754970", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There is no such thing as high assurance without high assurance hardware. High assurance hardware is essential, because any and all high assurance systems ultimately depend on hardware that conforms to, and does not undermine, critical system properties and invariants. And yet, high assurance hardware development is stymied by the conceptual gap between formal methods and hardware description languages used by engineers. This paper presents ReWire, a functional programming language providing a suitable foundation for formal verification of hardware designs, and a compiler for that language that translates high-level, semantics-driven designs directly into working hardware.
ReWire's design and implementation are presented, along with a case study in the design of a secure multicore processor, demonstrating both ReWire's expressiveness as a programming language and its power as a framework for formal, high-level reasoning about hardware systems.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Woithe:2015:TPA, author = "Hans Christian Woithe and Ulrich Kremer", title = "{TrilobiteG}: a programming architecture for autonomous underwater vehicles", journal = j-SIGPLAN, volume = "50", number = "5", pages = "14:1--14:??", month = may, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2808704.2754971", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Jul 31 19:39:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming autonomous systems can be challenging because many programming decisions must be made in real time and under stressful conditions, such as on a battlefield, during a short communication window, or during a storm at sea. As such, new programming designs are needed to reflect these specific and extreme challenges. TrilobiteG is a programming architecture for buoyancy-driven autonomous underwater vehicles (AUVs), called gliders. Gliders are designed to spend weeks to months in the ocean, where they operate fully autonomously while submerged and can only communicate via satellite during their limited time at the surface. Based on the experience gained from a seven-year-long collaboration with two oceanographic institutes, the TrilobiteG architecture has been developed with the main goal of enabling users to run more effective missions. The TrilobiteG programming environment consists of a domain-specific language called ALGAE, a lower-level service layer, and a set of real-time and faster-than-real-time simulators. The system has been used to program novel and robust glider behaviors, as well as to find software problems that otherwise may have remained undetected, with potentially catastrophic results. We believe that TrilobiteG can serve as a blueprint for other autonomous systems as well, and that TrilobiteG will motivate and enable a broader scientific community to work on extreme, real-world problems by using the simulation infrastructure.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '15 conference proceedings.", } @Article{Panchekha:2015:AIA, author = "Pavel Panchekha and Alex Sanchez-Stern and James R. Wilcox and Zachary Tatlock", title = "Automatically improving accuracy for floating point expressions", journal = j-SIGPLAN, volume = "50", number = "6", pages = "1--11", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737959", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scientific and engineering applications depend on floating point arithmetic to approximate real arithmetic. This approximation introduces rounding error, which can accumulate to produce unacceptable results.
While the numerical methods literature provides techniques to mitigate rounding error, applying these techniques requires manually rearranging expressions and understanding the finer details of floating point arithmetic. We introduce Herbie, a tool which automatically discovers the rewrites experts perform to improve accuracy. Herbie's heuristic search estimates and localizes rounding error using sampled points (rather than static error analysis), applies a database of rules to generate improvements, takes series expansions, and combines improvements for different input regions. We evaluated Herbie on examples from a classic numerical methods textbook, and found that Herbie was able to improve accuracy on each example, some by up to 60 bits, while imposing a median performance overhead of 40\%. Colleagues in machine learning have used Herbie to significantly improve the results of a clustering algorithm, and a mathematical library has accepted two patches generated using Herbie.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Zhang:2015:DTE, author = "Danfeng Zhang and Andrew C. Myers and Dimitrios Vytiniotis and Simon Peyton-Jones", title = "Diagnosing type errors with class", journal = j-SIGPLAN, volume = "50", number = "6", pages = "12--21", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738009", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Type inference engines often give terrible error messages, and the more sophisticated the type system the worse the problem. We show that even with the highly expressive type system implemented by the Glasgow Haskell Compiler (GHC)--including type classes, GADTs, and type families--it is possible to identify the most likely source of the type error, rather than the first source that the inference engine trips over. To determine which are the likely error sources, we apply a simple Bayesian model to a graph representation of the typing constraints; the satisfiability or unsatisfiability of paths within the graph provides evidence for or against possible explanations. While we build on prior work on error diagnosis for simpler type systems, inference in the richer type system of Haskell requires extending the graph with new nodes. The augmentation of the graph creates challenges both for Bayesian reasoning and for ensuring termination. Using a large corpus of Haskell programs, we show that this error localization technique is practical and significantly improves accuracy over the state of the art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Lopes:2015:PCP, author = "Nuno P. Lopes and David Menendez and Santosh Nagarakatte and John Regehr", title = "Provably correct peephole optimizations with {Alive}", journal = j-SIGPLAN, volume = "50", number = "6", pages = "22--32", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737965", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compilers should not miscompile. 
Our work addresses problems in developing peephole optimizations that perform local rewriting to improve the efficiency of LLVM code. These optimizations are individually difficult to get right, particularly in the presence of undefined behavior; taken together they represent a persistent source of bugs. This paper presents Alive, a domain-specific language for writing optimizations and for automatically either proving them correct or else generating counterexamples. Furthermore, Alive can be automatically translated into C++ code that is suitable for inclusion in an LLVM optimization pass. Alive is based on an attempt to balance usability and formal methods; for example, it captures---but largely hides---the detailed semantics of three different kinds of undefined behavior in LLVM. We have translated more than 300 LLVM optimizations into Alive and, in the process, found that eight of them were wrong.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Faddegon:2015:ADR, author = "Maarten Faddegon and Olaf Chitil", title = "Algorithmic debugging of real-world {Haskell} programs: deriving dependencies from the cost centre stack", journal = j-SIGPLAN, volume = "50", number = "6", pages = "33--42", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737985", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Existing algorithmic debuggers for Haskell require a transformation of all modules in a program, even libraries that the user does not want to debug and which may use language features not supported by the debugger. This is a pity, because a promising approach to debugging is therefore not applicable to many real-world programs. We use the cost centre stack from the Glasgow Haskell Compiler profiling environment together with runtime value observations as provided by the Haskell Object Observation Debugger (HOOD) to collect enough information for algorithmic debugging. Program annotations are in suspected modules only. With this technique algorithmic debugging is applicable to a much larger set of Haskell programs. This demonstrates that for functional languages in general a simple stack trace extension is useful to support tasks such as profiling and debugging.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Sidiroglou-Douskos:2015:AEE, author = "Stelios Sidiroglou-Douskos and Eric Lahtinen and Fan Long and Martin Rinard", title = "Automatic error elimination by horizontal code transfer across multiple applications", journal = j-SIGPLAN, volume = "50", number = "6", pages = "43--54", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737988", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Code Phage (CP), a system for automatically transferring correct code from donor applications into recipient applications that process the same inputs to successfully eliminate errors in the recipient. 
Experimental results using seven donor applications to eliminate ten errors in seven recipient applications highlight the ability of CP to transfer code across applications to eliminate out of bounds access, integer overflow, and divide by zero errors. Because CP works with binary donors with no need for source code or symbolic information, it supports a wide range of use cases. To the best of our knowledge, CP is the first system to automatically transfer code across multiple applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Liu:2015:LRT, author = "Peng Liu and Xiangyu Zhang and Omer Tripp and Yunhui Zheng", title = "{Light}: replay via tightly bounded recording", journal = j-SIGPLAN, volume = "50", number = "6", pages = "55--64", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738001", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reproducing concurrency bugs is a prominent challenge. Existing techniques either rely on recording very fine grained execution information and hence have high runtime overhead, or strive to log as little information as possible but provide no guarantee in reproducing a bug. We present Light, a technique that features much lower overhead compared to techniques based on fine grained recording, and that guarantees to reproduce concurrent bugs. We leverage and formally prove that recording flow dependences is the necessary and sufficient condition to reproduce a concurrent bug. The flow dependences, together with the thread local orders that can be automatically inferred (and hence not logged), are encoded as scheduling constraints. An SMT solver is used to derive a replay schedule, which is guaranteed to exist even though it may be different from the original schedule. Our experiments show that Light has only 44\% logging overhead, almost one order of magnitude lower than the state of the art techniques relying on logging memory accesses. Its space overhead is only 10\% of those techniques. Light can also reproduce all the bugs we have collected whereas existing techniques miss some of them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Lidbury:2015:MCC, author = "Christopher Lidbury and Andrei Lascu and Nathan Chong and Alastair F. Donaldson", title = "Many-core compiler fuzzing", journal = j-SIGPLAN, volume = "50", number = "6", pages = "65--76", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737986", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We address the compiler correctness problem for many-core systems through novel applications of fuzz testing to OpenCL compilers. 
Focusing on two methods from prior work, random differential testing and testing via equivalence modulo inputs (EMI), we present several strategies for random generation of deterministic, communicating OpenCL kernels, and an injection mechanism that allows EMI testing to be applied to kernels that otherwise exhibit little or no dynamically-dead code. We use these methods to conduct a large, controlled testing campaign with respect to 21 OpenCL (device, compiler) configurations, covering a range of CPU, GPU, accelerator, FPGA and emulator implementations. Our study provides independent validation of claims in prior work related to the effectiveness of random differential testing and EMI testing, proposes novel methods for lifting these techniques to the many-core setting and reveals a significant number of OpenCL compiler bugs in commercial implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Sergey:2015:MVF, author = "Ilya Sergey and Aleksandar Nanevski and Anindya Banerjee", title = "Mechanized verification of fine-grained concurrent programs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "77--87", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737964", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Efficient concurrent programs and data structures rarely employ coarse-grained synchronization mechanisms (i.e., locks); instead, they implement custom synchronization patterns via fine-grained primitives, such as compare-and-swap. Due to sophisticated interference scenarios between threads, reasoning about such programs is challenging and error-prone, and can benefit from mechanization. In this paper, we present the first completely formalized framework for mechanized verification of full functional correctness of fine-grained concurrent programs. Our tool is based on the recently proposed program logic FCSL. It is implemented as an embedded DSL in the dependently-typed language of the Coq proof assistant, and is powerful enough to reason about programming features such as higher-order functions and local thread spawning. By incorporating a uniform concurrency model, based on state-transition systems and partial commutative monoids, FCSL makes it possible to build proofs about concurrent libraries in a thread-local, compositional way, thus facilitating scalability and reuse: libraries are verified just once, and their specifications are used ubiquitously in client-side reasoning. 
We illustrate the proof layout in FCSL by example, outline its infrastructure, and report on our experience of using FCSL to verify a number of concurrent algorithms and data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Sharma:2015:VPC, author = "Rahul Sharma and Michael Bauer and Alex Aiken", title = "Verification of producer-consumer synchronization in {GPU} programs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "88--98", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737962", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Previous efforts to formally verify code written for GPUs have focused solely on kernels written within the traditional data-parallel GPU programming model. No previous work has considered the higher performance, but more complex, warp-specialized kernels based on producer-consumer named barriers available on current hardware. In this work we present the first formal operational semantics for named barriers and define what it means for a warp-specialized kernel to be correct. We give algorithms for verifying the correctness of warp-specialized kernels and prove that they are both sound and complete for the most common class of warp-specialized programs. We also present WEFT, a verification tool for checking warp-specialized code. Using WEFT, we discover several non-trivial bugs in production warp-specialized kernels.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Gammie:2015:RSV, author = "Peter Gammie and Antony L. Hosking and Kai Engelhardt", title = "Relaxing safely: verified on-the-fly garbage collection for {x86-TSO}", journal = j-SIGPLAN, volume = "50", number = "6", pages = "99--109", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738006", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We report on a machine-checked verification of safety for a state-of-the-art, on-the-fly, concurrent, mark-sweep garbage collector that is designed for multi-core architectures with weak memory consistency. The proof explicitly incorporates the relaxed memory semantics of x86 multiprocessors. To our knowledge, this is the first fully machine-checked proof of safety for such a garbage collector. We couch the proof in a framework that system implementers will find appealing, with the fundamental components of the system specified in a simple and intuitive programming language. 
The abstract model is detailed enough for its correspondence with an assembly language implementation to be straightforward.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Tassarotti:2015:VRC, author = "Joseph Tassarotti and Derek Dreyer and Viktor Vafeiadis", title = "Verifying read-copy-update in a logic for weak memory", journal = j-SIGPLAN, volume = "50", number = "6", pages = "110--120", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737992", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Read-Copy-Update (RCU) is a technique for letting multiple readers safely access a data structure while a writer concurrently modifies it. It is used heavily in the Linux kernel in situations where fast reads are important and writes are infrequent. Optimized implementations rely only on the weaker memory orderings provided by modern hardware, avoiding the need for expensive synchronization instructions (such as memory barriers) as much as possible. Using GPS, a recently developed program logic for the C/C++11 memory model, we verify an implementation of RCU for a singly-linked list assuming ``release-acquire'' semantics. Although release-acquire synchronization is stronger than what is required by real RCU implementations, it is nonetheless significantly weaker than the assumption of sequential consistency made in prior work on RCU verification. Ours is the first formal proof of correctness for an implementation of RCU under a weak memory model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Ko:2015:LCT, author = "Yousun Ko and Bernd Burgstaller and Bernhard Scholz", title = "{LaminarIR}: compile-time queues for structured streams", journal = j-SIGPLAN, volume = "50", number = "6", pages = "121--130", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737994", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Stream programming languages employ FIFO (first-in, first-out) semantics to model data channels between producers and consumers. A FIFO data channel stores tokens in a buffer that is accessed indirectly via read- and write-pointers. This indirect token-access decouples a producer's write-operations from the read-operations of the consumer, thereby making dataflow implicit. For a compiler, indirect token-access obscures data-dependencies, which renders standard optimizations ineffective and impacts stream program performance negatively. In this paper we propose a transformation for structured stream programming languages such as StreamIt that shifts FIFO buffer management from run-time to compile-time and eliminates splitters and joiners, whose task is to distribute and merge streams. To show the effectiveness of our lowering transformation, we have implemented a StreamIt to C compilation framework. We have developed our own intermediate representation (IR) called LaminarIR, which facilitates the transformation. 
We report on the enabling effect of the LaminarIR on LLVM's optimizations, which required the conversion of several standard StreamIt benchmarks from static to randomized input, to prevent computation of partial results at compile-time. We conducted our experimental evaluation on the Intel i7-2600K, AMD Opteron 6378, Intel Xeon Phi 3120A and ARM Cortex-A15 platforms. Our LaminarIR reduces data-communication on average by 35.9\% and achieves platform-specific speedups between 3.73x and 4.98x over StreamIt. We reduce memory accesses by more than 60\% and achieve energy savings of up to 93.6\% on the Intel i7-2600K.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Ding:2015:OCA, author = "Wei Ding and Xulong Tang and Mahmut Kandemir and Yuanrui Zhang and Emre Kultursay", title = "Optimizing off-chip accesses in multicores", journal = j-SIGPLAN, volume = "50", number = "6", pages = "131--142", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737989", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In a network-on-chip (NoC) based manycore architecture, an off-chip data access (main memory access) needs to travel through the on-chip network, spending considerable amount of time within the chip (in addition to the memory access latency). In addition, it contends with on-chip (cache) accesses as both use the same NoC resources. In this paper, focusing on data-parallel, multithreaded applications, we propose a compiler-based off-chip data access localization strategy, which places data elements in the memory space such that an off-chip access traverses a minimum number of links (hops) to reach the memory controller that handles this access. This brings three main benefits. First, the network latency of off-chip accesses gets reduced; second, the network latency of on-chip accesses gets reduced; and finally, the memory latency of off-chip accesses improves, due to reduced queue latencies. We present an experimental evaluation of our optimization strategy using a set of 13 multithreaded application programs under both private and shared last-level caches. The results collected emphasize the importance of optimizing the off-chip data accesses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Mehta:2015:ICS, author = "Sanyam Mehta and Pen-Chung Yew", title = "Improving compiler scalability: optimizing large programs at small price", journal = j-SIGPLAN, volume = "50", number = "6", pages = "143--152", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737954", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compiler scalability is a well known problem: reasoning about the application of useful optimizations over large program scopes consumes too much time and memory during compilation. 
This problem is exacerbated in polyhedral compilers that use powerful yet costly integer programming algorithms to compose loop optimizations. As a result, the benefits that a polyhedral compiler has to offer to programs such as real scientific applications that contain sequences of loop nests remain impractical for common users. In this work, we address this scalability problem in polyhedral compilers. We identify three causes of unscalability, each of which stems from the large number of statements and dependences in the program scope. We propose a one-shot solution to the problem by reducing the effective number of statements and dependences as seen by the compiler. We achieve this by representing a sequence of statements in a program by a single super-statement. This set of super-statements exposes the minimum sufficient constraints to the Integer Linear Programming (ILP) solver for finding correct optimizations. We implement our approach in the PLuTo polyhedral compiler and find that it condenses the program statements and program dependences by factors of 4.7x and 6.4x, respectively, averaged over 9 hot regions (ranging from 48 to 121 statements) in 5 real applications. As a result, the improvements in time and memory requirements for compilation are 268x and 20x, respectively, over the latest version of the PLuTo compiler. The final compile times are comparable to the Intel compiler while the performance is 1.92x better on average due to the latter's conservative approach to loop optimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Appel:2015:VCP, author = "Andrew W. Appel", title = "Verification of a cryptographic primitive: {SHA-256} (abstract)", journal = j-SIGPLAN, volume = "50", number = "6", pages = "153--153", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2774972", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A full formal machine-checked verification of a C program: the OpenSSL implementation of SHA-256. This is an interactive proof of functional correctness in the Coq proof assistant, using the Verifiable C program logic. Verifiable C is a separation logic for the C language, proved sound w.r.t. the operational semantics for C, connected to the CompCert verified optimizing C compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Deligiannis:2015:APA, author = "Pantazis Deligiannis and Alastair F.
Donaldson and Jeroen Ketema and Akash Lal and Paul Thomson", title = "Asynchronous programming, analysis and testing with state machines", journal = j-SIGPLAN, volume = "50", number = "6", pages = "154--164", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737996", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming efficient asynchronous systems is challenging because it can often be hard to express the design declaratively, or to defend against data races and interleaving-dependent assertion violations. Previous work has only addressed these challenges in isolation, by either designing a new declarative language, a new data race detection tool or a new testing technique. We present P\#, a language for high-reliability asynchronous programming co-designed with a static data race analysis and systematic concurrency testing infrastructure. We describe our experience using P\# to write several distributed protocols and port an industrial-scale system internal to Microsoft, showing that the combined techniques, by leveraging the design of P\#, are effective in finding bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Huang:2015:SMC, author = "Jeff Huang", title = "Stateless model checking concurrent programs with maximal causality reduction", journal = j-SIGPLAN, volume = "50", number = "6", pages = "165--174", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737975", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present maximal causality reduction (MCR), a new technique for stateless model checking. MCR systematically explores the state-space of concurrent programs with a provably minimal number of executions. Each execution corresponds to a distinct maximal causal model extracted from a given execution trace, which captures the largest possible set of causally equivalent executions. Moreover, MCR is embarrassingly parallel by shifting the runtime exploration cost to offline analysis. We have designed and implemented MCR using a constraint-based approach and compared with iterative context bounding (ICB) and dynamic partial order reduction (DPOR) on both benchmarks and real-world programs. MCR reduces the number of executions explored by ICB and ICB+DPOR by orders of magnitude, and significantly improves the scalability, efficiency, and effectiveness of the state-of-the-art for both state-space exploration and bug finding. 
In our experiments, MCR has also revealed several new data races and null pointer dereference errors in frequently studied real-world programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Samak:2015:SRT, author = "Malavika Samak and Murali Krishna Ramanathan and Suresh Jagannathan", title = "Synthesizing racy tests", journal = j-SIGPLAN, volume = "50", number = "6", pages = "175--185", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737998", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Subtle concurrency errors in multithreaded libraries that arise because of incorrect or inadequate synchronization are often difficult to pinpoint precisely using only static techniques. On the other hand, the effectiveness of dynamic race detectors is critically dependent on multithreaded test suites whose execution can be used to identify and trigger races. Usually, such multithreaded tests need to invoke a specific combination of methods with objects involved in the invocations being shared appropriately to expose a race. Without a priori knowledge of the race, construction of such tests can be challenging. In this paper, we present a lightweight and scalable technique for synthesizing precisely these kinds of tests. Given a multithreaded library and a sequential test suite, we describe a fully automated analysis that examines sequential execution traces, and produces as its output a concurrent client program that drives shared objects via library method calls to states conducive for triggering a race. Experimental results on a variety of well-tested Java libraries yield 101 synthesized multithreaded tests in less than four minutes. Analyzing the execution of these tests using an off-the-shelf race detector reveals 187 harmful races, including several previously unreported ones. Our implementation, named NARADA, and the results of our experiments are available at http://www.csa.iisc.ernet.in/~sss/tools/narada.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Koskinen:2015:PPM, author = "Eric Koskinen and Matthew Parkinson", title = "The {Push\slash Pull} model of transactions", journal = j-SIGPLAN, volume = "50", number = "6", pages = "186--195", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737995", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a general theory of serializability, unifying a wide range of transactional algorithms, including some that are yet to come. To this end, we provide a compact semantics in which concurrent transactions PUSH their effects into the shared view (or UNPUSH to recall effects) and PULL the effects of potentially uncommitted concurrent transactions into their local view (or UNPULL to detangle). 
Each operation comes with simple criteria given in terms of commutativity (Lipton's left-movers and right-movers). The benefit of this model is that most of the elaborate reasoning (coinduction, simulation, subtle invariants, etc.) necessary for proving the serializability of a transactional algorithm is already proved within the semantic model. Thus, proving serializability (or opacity) amounts simply to mapping the algorithm on to our rules, and showing that it satisfies the rules' criteria.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{McClurg:2015:ESN, author = "Jedidiah McClurg and Hossein Hojjat and Pavol Cern{\'y} and Nate Foster", title = "Efficient synthesis of network updates", journal = j-SIGPLAN, volume = "50", number = "6", pages = "196--207", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737980", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software-defined networking (SDN) is revolutionizing the networking industry, but current SDN programming platforms do not provide automated mechanisms for updating global configurations on the fly. Implementing updates by hand is challenging for SDN programmers because networks are distributed systems with hundreds or thousands of interacting nodes. Even if initial and final configurations are correct, naively updating individual nodes can lead to incorrect transient behaviors, including loops, black holes, and access control violations. This paper presents an approach for automatically synthesizing updates that are guaranteed to preserve specified properties. We formalize network updates as a distributed programming problem and develop a synthesis algorithm based on counterexample-guided search and incremental model checking. We describe a prototype implementation, and present results from experiments on real-world topologies and properties demonstrating that our tool scales to updates involving over one-thousand nodes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Nori:2015:ESP, author = "Aditya V. Nori and Sherjil Ozair and Sriram K. Rajamani and Deepak Vijaykeerthy", title = "Efficient synthesis of probabilistic programs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "208--217", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737982", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We show how to automatically synthesize probabilistic programs from real-world datasets. Such a synthesis is feasible due to a combination of two techniques: (1) We borrow the idea of ``sketching'' from synthesis of deterministic programs, and allow the programmer to write a skeleton program with ``holes''. Sketches enable the programmer to communicate domain-specific intuition about the structure of the desired program and prune the search space, and (2) we design an efficient Markov Chain Monte Carlo (MCMC) based synthesis algorithm to instantiate the holes in the sketch with program fragments. 
Our algorithm efficiently synthesizes a probabilistic program that is most consistent with the data. A core difficulty in synthesizing probabilistic programs is computing the likelihood L(P | D) of a candidate program P generating data D. We propose an approximate method to compute likelihoods using mixtures of Gaussian distributions, thereby avoiding expensive computation of integrals. The use of such approximations enables us to speed up evaluation of the likelihood of candidate programs by a factor of 1000, and makes Markov Chain Monte Carlo based search feasible. We have implemented our algorithm in a tool called PSKETCH, and our results are encouraging: PSKETCH is able to automatically synthesize 16 non-trivial real-world probabilistic programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Barowy:2015:FER, author = "Daniel W. Barowy and Sumit Gulwani and Ted Hart and Benjamin Zorn", title = "{FlashRelate}: extracting relational data from semi-structured spreadsheets using examples", journal = j-SIGPLAN, volume = "50", number = "6", pages = "218--228", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737952", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "With hundreds of millions of users, spreadsheets are one of the most important end-user applications. Spreadsheets are easy to use and allow users great flexibility in storing data. This flexibility comes at a price: users often treat spreadsheets as a poor man's database, leading to creative solutions for storing high-dimensional data. The trouble arises when users need to answer queries with their data. Data manipulation tools make strong assumptions about data layouts and cannot read these ad-hoc databases. Converting data into the appropriate layout requires programming skills or a major investment in manual reformatting. The effect is that a vast amount of real-world data is ``locked-in'' to a proliferation of one-off formats. We introduce FlashRelate, a synthesis engine that lets ordinary users extract structured relational data from spreadsheets without programming. Instead, users extract data by supplying examples of output relational tuples. FlashRelate uses these examples to synthesize a program in Flare. Flare is a novel extraction language that extends regular expressions with geometric constructs. An interactive user interface on top of FlashRelate lets end users extract data by point-and-click. We demonstrate that correct Flare programs can be synthesized in seconds from a small set of examples for 43 real-world scenarios. Finally, our case study demonstrates FlashRelate's usefulness in addressing the widespread problem of data trapped in corporate and government formats.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Feser:2015:SDS, author = "John K.
Feser and Swarat Chaudhuri and Isil Dillig", title = "Synthesizing data structure transformations from input-output examples", journal = j-SIGPLAN, volume = "50", number = "6", pages = "229--239", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737977", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a method for example-guided synthesis of functional programs over recursive data structures. Given a set of input-output examples, our method synthesizes a program in a functional language with higher-order combinators like map and fold. The synthesized program is guaranteed to be the simplest program in the language to fit the examples. Our approach combines three technical ideas: inductive generalization, deduction, and enumerative search. First, we generalize the input-output examples into hypotheses about the structure of the target program. For each hypothesis, we use deduction to infer new input/output examples for the missing subexpressions. This leads to a new subproblem where the goal is to synthesize expressions within each hypothesis. Since not every hypothesis can be realized into a program that fits the examples, we use a combination of best-first enumeration and deduction to search for a hypothesis that meets our needs. We have implemented our method in a tool called \lambda 2, and we evaluate this tool on a large set of synthesis problems involving lists, trees, and nested data structures. The experiments demonstrate the scalability and broad scope of \lambda 2. A highlight is the synthesis of a program believed to be the world's earliest functional pearl.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Ziv:2015:CCC, author = "Ofri Ziv and Alex Aiken and Guy Golan-Gueta and G. Ramalingam and Mooly Sagiv", title = "Composing concurrency control", journal = j-SIGPLAN, volume = "50", number = "6", pages = "240--249", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737970", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrency control poses significant challenges when composing computations over multiple data-structures (objects) with different concurrency-control implementations. We formalize the usually desired requirements (serializability, abort-safety, deadlock-safety, and opacity) as well as stronger versions of these properties that enable composition. We show how to compose protocols satisfying these properties so that the resulting combined protocol also satisfies these properties. Our approach generalizes well-known protocols (such as two-phase-locking and two-phase-commit) and leads to new protocols. We apply this theory to show how we can safely compose optimistic and pessimistic concurrency control. 
For example, we show how we can execute a transaction that accesses two objects, one controlled by an STM and another by locking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Zhang:2015:DPO, author = "Naling Zhang and Markus Kusano and Chao Wang", title = "Dynamic partial order reduction for relaxed memory models", journal = j-SIGPLAN, volume = "50", number = "6", pages = "250--259", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737956", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Under a relaxed memory model such as TSO or PSO, a concurrent program running on a shared-memory multiprocessor may observe two types of nondeterminism: the nondeterminism in thread scheduling and the nondeterminism in store buffering. Although there is a large body of work on mitigating the scheduling nondeterminism during runtime verification, methods for soundly mitigating the store buffering nondeterminism are lacking. We propose a new dynamic partial order reduction (POR) algorithm for verifying concurrent programs under TSO and PSO. Our method relies on modeling both types of nondeterminism in a unified framework, which allows us to extend existing POR techniques to TSO and PSO without overhauling the verification algorithm. In addition to sound POR, we also propose a buffer-bounding method for more aggressively reducing the state space. We have implemented our new methods in a stateless model checking tool and demonstrated their effectiveness on a set of multithreaded C benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Emmi:2015:MRS, author = "Michael Emmi and Constantin Enea and Jad Hamza", title = "Monitoring refinement via symbolic reasoning", journal = j-SIGPLAN, volume = "50", number = "6", pages = "260--269", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737983", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Efficient implementations of concurrent objects such as semaphores, locks, and atomic collections are essential to modern computing. Programming such objects is error prone: in minimizing the synchronization overhead between concurrent object invocations, one risks the conformance to reference implementations --- or in formal terms, one risks violating observational refinement. Precisely testing this refinement even within a single execution is intractable, limiting existing approaches to executions with very few object invocations. We develop scalable and effective algorithms for detecting refinement violations. Our algorithms are founded on incremental, symbolic reasoning, and exploit foundational insights into the refinement-checking problem. Our approach is sound, in that we detect only actual violations, and scales far beyond existing violation-detection algorithms. 
Empirically, we find that our approach is practically complete, in that we detect the violations arising in actual executions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Longfield:2015:PGS, author = "Stephen Longfield and Brittany Nkounkou and Rajit Manohar and Ross Tate", title = "Preventing glitches and short circuits in high-level self-timed chip specifications", journal = j-SIGPLAN, volume = "50", number = "6", pages = "270--279", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737967", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Self-timed chip designs are commonly specified in a high-level message-passing language called CHP. This language is closely related to Hoare's CSP except it admits erroneous behavior due to the necessary limitations of efficient hardware implementations. For example, two processes sending on the same channel at the same time causes glitches and short circuits in the physical chip implementation. If a CHP program maintains certain invariants, such as only one process is sending on any given channel at a time, it can guarantee an error-free execution that behaves much like a CSP program would. In this paper, we present an inferable effect system for ensuring that these invariants hold, drawing from model-checking methodologies while exploiting language-usage patterns and domain-specific specializations to achieve efficiency. This analysis is sound, and is even complete for the common subset of CHP programs without data-sensitive synchronization. We have implemented the analysis and demonstrated that it scales to validate even microprocessors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Lal:2015:DID, author = "Akash Lal and Shaz Qadeer", title = "{DAG} inlining: a decision procedure for reachability-modulo-theories in hierarchical programs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "280--290", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737987", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A hierarchical program is one with multiple procedures but no loops or recursion. This paper studies the problem of deciding reachability queries in hierarchical programs where individual statements can be encoded in a decidable logic (say in SMT). This problem is fundamental to verification and most directly applicable to doing bounded reachability in programs, i.e., reachability under a bound on the number of loop iterations and recursive calls. The usual method of deciding reachability in hierarchical programs is to first inline all procedures and then do reachability on the resulting single-procedure program. Such inlining unfolds the call graph of the program to a tree and may lead to an exponential increase in the size of the program. 
We design and evaluate a method called DAG inlining that unfolds the call graph to a directed acyclic graph (DAG) instead of a tree by sharing the bodies of procedures at certain points during inlining. DAG inlining can produce much more compact representations than tree inlining. Empirically, we show that it leads to significant improvements in the running time of a state-of-the-art verifier.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Johnson:2015:EES, author = "Andrew Johnson and Lucas Waye and Scott Moore and Stephen Chong", title = "Exploring and enforcing security guarantees via program dependence graphs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "291--302", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737957", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present PIDGIN, a program analysis and understanding tool that enables the specification and enforcement of precise application-specific information security guarantees. PIDGIN also allows developers to interactively explore the information flows in their applications to develop policies and investigate counter-examples. PIDGIN combines program dependence graphs (PDGs), which precisely capture the information flows in a whole application, with a custom PDG query language. Queries express properties about the paths in the PDG; because paths in the PDG correspond to information flows in the application, queries can be used to specify global security policies. PIDGIN is scalable. Generating a PDG for a 330k line Java application takes 90 seconds, and checking a policy on that PDG takes under 14 seconds. The query language is expressive, supporting a large class of precise, application-specific security guarantees. Policies are separate from the code and do not interfere with testing or development, and can be used for security regression testing. We describe the design and implementation of PIDGIN and report on using it: (1) to explore information security guarantees in legacy programs; (2) to develop and modify security policies concurrently with application development; and (3) to develop policies based on known vulnerabilities.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Singh:2015:MNP, author = "Gagandeep Singh and Markus P{\"u}schel and Martin Vechev", title = "Making numerical program analysis fast", journal = j-SIGPLAN, volume = "50", number = "6", pages = "303--313", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738000", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Numerical abstract domains are a fundamental component in modern static program analysis and are used in a wide range of scenarios (e.g. computing array bounds, disjointness, etc). However, analysis with these domains can be very expensive, deeply affecting the scalability and practical applicability of the static analysis. 
Hence, it is critical to ensure that these domains are made highly efficient. In this work, we present a complete approach for optimizing the performance of the Octagon numerical abstract domain, a domain shown to be particularly effective in practice. Our optimization approach is based on two key insights: (i) the ability to perform online decomposition of the octagons leading to a massive reduction in operation counts, and (ii) leveraging classic performance optimizations from linear algebra such as vectorization, locality of reference, scalar replacement and others, for improving the key bottlenecks of the domain. Applying these ideas, we designed new algorithms for the core Octagon operators with better asymptotic runtime than prior work and combined them with the optimization techniques to achieve high actual performance. We implemented our approach in the Octagon operators exported by the popular APRON C library, thus enabling existing static analyzers using APRON to immediately benefit from our work. To demonstrate the performance benefits of our approach, we evaluated our framework on three published static analyzers showing massive speed-ups for the time spent in Octagon analysis (e.g., up to 146x) as well as significant end-to-end program analysis speed-ups (up to 18.7x). Based on these results, we believe that our framework can serve as a new basis for static analysis with the Octagon numerical domain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Weijiang:2015:TDA, author = "Yusheng Weijiang and Shruthi Balakrishna and Jianqiao Liu and Milind Kulkarni", title = "Tree dependence analysis", journal = j-SIGPLAN, volume = "50", number = "6", pages = "314--325", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737972", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We develop a new framework for analyzing recursive methods that perform traversals over trees, called tree dependence analysis. This analysis translates dependence analysis techniques for regular programs to the irregular space, identifying the structure of dependences within a recursive method that traverses trees. We develop a dependence test that exploits the dependence structure of such programs, and can prove that several locality- and parallelism- enhancing transformations are legal. In addition, we extend our analysis with a novel path-dependent, conditional analysis to refine the dependence test and prove the legality of transformations for a wider range of algorithms. We then use these analyses to show that several common algorithms that manipulate trees recursively are amenable to several locality- and parallelism-enhancing transformations. 
This work shows that classical dependence analysis techniques, which have largely been confined to nested loops over array data structures, can be extended and translated to work for complex, recursive programs that operate over pointer-based data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Kang:2015:FCM, author = "Jeehoon Kang and Chung-Kil Hur and William Mansky and Dmitri Garbuzov and Steve Zdancewic and Viktor Vafeiadis", title = "A formal {C} memory model supporting integer-pointer casts", journal = j-SIGPLAN, volume = "50", number = "6", pages = "326--335", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738005", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The ISO C standard does not specify the semantics of many valid programs that use non-portable idioms such as integer-pointer casts. Recent efforts at formal definitions and verified implementation of the C language inherit this feature. By adopting high-level abstract memory models, they validate common optimizations. On the other hand, this prevents reasoning about much low-level code relying on the behavior of common implementations, where formal verification has many applications. We present the first formal memory model that allows many common optimizations and fully supports operations on the representation of pointers. All arithmetic operations are well-defined for pointers that have been cast to integers. Crucially, our model is also simple to understand and program with. All our results are fully formalized in Coq.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Hathhorn:2015:DUC, author = "Chris Hathhorn and Chucky Ellison and Grigore Rosu", title = "Defining the undefinedness of {C}", journal = j-SIGPLAN, volume = "50", number = "6", pages = "336--345", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737979", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a ``negative'' semantics of the C11 language---a semantics that does not just give meaning to correct programs, but also rejects undefined programs. We investigate undefined behavior in C and discuss the techniques and special considerations needed for formally specifying it. We have used these techniques to modify and extend a semantics of C into one that captures undefined behavior. The amount of semantic infrastructure and effort required to achieve this was unexpectedly high, in the end nearly doubling the size of the original semantics. From our semantics, we have automatically extracted an undefinedness checker, which we evaluate against other popular analysis tools, using our own test suite in addition to a third-party test suite. Our checker is capable of detecting examples of all 77 categories of core language undefinedness appearing in the C11 standard, more than any other tool we considered. 
Based on this evaluation, we argue that our work is the most comprehensive and complete semantic treatment of undefined behavior in C, and thus of the C language itself.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Park:2015:KCF, author = "Daejun Park and Andrei Stefanescu and Grigore Rosu", title = "{KJS}: a complete formal semantics of {JavaScript}", journal = j-SIGPLAN, volume = "50", number = "6", pages = "346--356", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737991", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents KJS, the most complete and thoroughly tested formal semantics of JavaScript to date. Being executable, KJS has been tested against the ECMAScript 5.1 conformance test suite, and passes all 2,782 core language tests. Among the existing implementations of JavaScript, only Chrome V8's passes all the tests, and no other semantics passes more than 90\%. In addition to a reference implementation for JavaScript, KJS also yields a simple coverage metric for a test suite: the set of semantic rules it exercises. Our semantics revealed that the ECMAScript 5.1 conformance test suite fails to cover several semantic rules. Guided by the semantics, we wrote tests to exercise those rules. The new tests revealed bugs both in production JavaScript engines (Chrome V8, Safari WebKit, Firefox SpiderMonkey) and in other semantics. KJS is symbolically executable, thus it can be used for formal analysis and verification of JavaScript programs. We verified non-trivial programs and found a known security vulnerability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Wilcox:2015:VFI, author = "James R. Wilcox and Doug Woos and Pavel Panchekha and Zachary Tatlock and Xi Wang and Michael D. Ernst and Thomas Anderson", title = "{Verdi}: a framework for implementing and formally verifying distributed systems", journal = j-SIGPLAN, volume = "50", number = "6", pages = "357--368", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737958", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Distributed systems are difficult to implement correctly because they must handle both concurrency and failures: machines may crash at arbitrary points and networks may reorder, drop, or duplicate packets. Further, their behavior is often too complex to permit exhaustive testing. Bugs in these systems have led to the loss of critical data and unacceptable service outages. We present Verdi, a framework for implementing and formally verifying distributed systems in Coq. Verdi formalizes various network semantics with different faults, and the developer chooses the most appropriate fault model when verifying their implementation. 
Furthermore, Verdi eases the verification burden by enabling the developer to first verify their system under an idealized fault model, then transfer the resulting correctness guarantees to a more realistic fault model without any additional proof burden. To demonstrate Verdi's utility, we present the first mechanically checked proof of linearizability of the Raft state machine replication algorithm, as well as verified implementations of a primary-backup replication system and a key-value store. These verified systems provide similar performance to unverified equivalents.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Olivo:2015:SDA, author = "Oswaldo Olivo and Isil Dillig and Calvin Lin", title = "Static detection of asymptotic performance bugs in collection traversals", journal = j-SIGPLAN, volume = "50", number = "6", pages = "369--378", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737966", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper identifies and formalizes a prevalent class of asymptotic performance bugs called redundant traversal bugs and presents a novel static analysis for automatically detecting them. We evaluate our technique by implementing it in a tool called CLARITY and applying it to widely-used software packages such as the Google Core Collections Library, the Apache Common Collections, and the Apache Ant build tool. Across 1.6M lines of Java code, CLARITY finds 92 instances of redundant traversal bugs, including 72 that have never been previously reported, with just 5 false positives. To evaluate the performance impact of these bugs, we manually repair these programs and find that for an input size of 50,000, all repaired programs are at least 2.45$ \times $ faster than their original code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Ding:2015:AAC, author = "Yufei Ding and Jason Ansel and Kalyan Veeramachaneni and Xipeng Shen and Una-May O'Reilly and Saman Amarasinghe", title = "Autotuning algorithmic choice for input sensitivity", journal = j-SIGPLAN, volume = "50", number = "6", pages = "379--390", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737969", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A daunting challenge faced by program performance autotuning is input sensitivity, where the best autotuned configuration may vary with different input sets. This paper presents a novel two-level input learning algorithm to tackle the challenge for an important class of autotuning problems, algorithmic autotuning. The new approach uses a two-level input clustering method to automatically refine input grouping, feature selection, and classifier construction.
Its design solves a series of open issues that are particularly essential to algorithmic autotuning, including the enormous optimization space, complex influence by deep input features, high cost in feature extraction, and variable accuracy of algorithmic choices. Experimental results show that the new solution yields up to a 3x speedup over using a single configuration for all inputs, and a 34x speedup over a traditional one-level method for addressing input sensitivity in program optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Mendis:2015:HLH, author = "Charith Mendis and Jeffrey Bosboom and Kevin Wu and Shoaib Kamil and Jonathan Ragan-Kelley and Sylvain Paris and Qin Zhao and Saman Amarasinghe", title = "Helium: lifting high-performance stencil kernels from stripped x86 binaries to halide {DSL} code", journal = j-SIGPLAN, volume = "50", number = "6", pages = "391--402", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737974", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Highly optimized programs are prone to bit rot, where performance quickly becomes suboptimal in the face of new hardware and compiler techniques. In this paper we show how to automatically lift performance-critical stencil kernels from a stripped x86 binary and generate the corresponding code in the high-level domain-specific language Halide. Using Halide's state-of-the-art optimizations targeting current hardware, we show that new optimized versions of these kernels can replace the originals to rejuvenate the application for newer hardware. The original optimized code for kernels in stripped binaries is nearly impossible to analyze statically. Instead, we rely on dynamic traces to regenerate the kernels. We perform buffer structure reconstruction to identify input, intermediate and output buffer shapes. We abstract from a forest of concrete dependency trees which contain absolute memory addresses to symbolic trees suitable for high-level code generation. This is done by canonicalizing trees, clustering them based on structure, inferring higher-dimensional buffer accesses and finally by solving a set of linear equations based on buffer accesses to lift them up to simple, high-level expressions. Helium can handle highly optimized, complex stencil kernels with input-dependent conditionals. We lift seven kernels from Adobe Photoshop giving a 75\% performance improvement, four kernels from IrfanView, leading to 4.97$ \times $ performance, and one stencil from the miniGMG multigrid benchmark netting a 4.25$ \times $ improvement in performance. We manually rejuvenated Photoshop by replacing eleven of Photoshop's filters with our lifted implementations, giving 1.12$ \times $ speedup without affecting the user experience.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Bowman:2015:PGM, author = "William J. Bowman and Swaha Miller and Vincent St-Amour and R. 
Kent Dybvig", title = "Profile-guided meta-programming", journal = j-SIGPLAN, volume = "50", number = "6", pages = "403--412", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737990", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Contemporary compiler systems such as GCC, .NET, and LLVM incorporate profile-guided optimizations (PGOs) on low-level intermediate code and basic blocks, with impressive results over purely static heuristics. Recent work shows that profile information is also useful for performing source-to-source optimizations via meta-programming. For example, using profiling information to inform decisions about data structures and algorithms can potentially lead to asymptotic improvements in performance. We present a design for profile-guided meta-programming in a general-purpose meta-programming system. Our design is parametric over the particular profiler and meta-programming system. We implement this design in two different meta-programming systems---the syntactic extensions systems of Chez Scheme and Racket---and provide several profile-guided meta-programs as usability case studies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Sivaramakrishnan:2015:DPE, author = "KC Sivaramakrishnan and Gowtham Kaki and Suresh Jagannathan", title = "Declarative programming over eventually consistent data stores", journal = j-SIGPLAN, volume = "50", number = "6", pages = "413--424", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737981", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "User-facing online services utilize geo-distributed data stores to minimize latency and tolerate partial failures, with the intention of providing a fast, always-on experience. However, geo-distribution does not come for free; application developers have to contend with weak consistency behaviors, and the lack of abstractions to composably construct high-level replicated data types, necessitating the need for complex application logic and invariably exposing inconsistencies to the user. Some commercial distributed data stores and several academic proposals provide a lattice of consistency levels, with stronger consistency guarantees incurring increased latency and throughput costs. However, correctly assigning the right consistency level for an operation requires subtle reasoning and is often an error-prone task. In this paper, we present QUELEA, a declarative programming model for eventually consistent data stores (ECDS), equipped with a contract language, capable of specifying fine-grained application --- level consistency properties. A contract enforcement system analyses contracts, and automatically generates the appropriate consistency protocol for the method protected by the contract. We describe an implementation of QUELEA on top of an off-the-shelf ECDS that provides support for coordination-free transactions. 
Several benchmarks including two large web applications, illustrate the effectiveness of our approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Siek:2015:BCT, author = "Jeremy Siek and Peter Thiemann and Philip Wadler", title = "Blame and coercion: together again for the first time", journal = j-SIGPLAN, volume = "50", number = "6", pages = "425--435", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737968", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "C\#, Dart, Pyret, Racket, TypeScript, VB: many recent languages integrate dynamic and static types via gradual typing. We systematically develop three calculi for gradual typing and the relations between them, building on and strengthening previous work. The calculi are: \lambda B, based on the blame calculus of Wadler and Findler (2009); \lambda C, inspired by the coercion calculus of Henglein (1994); \lambda S inspired by the space-efficient calculus of Herman, Tomb, and Flanagan (2006) and the threesome calculus of Siek and Wadler (2010). While \lambda B is little changed from previous work, \lambda C and \lambda S are new. Together, \lambda B, \lambda C, and \lambda S provide a coherent foundation for design, implementation, and optimisation of gradual types. We define translations from \lambda B to \lambda C and from \lambda C to \lambda S. Much previous work lacked proofs of correctness or had weak correctness criteria; here we demonstrate the strongest correctness criterion one could hope for, that each of the translations is fully abstract. Each of the calculi reinforces the design of the others: \lambda C has a particularly simple definition, and the subtle definition of blame safety for \lambda B is justified by the simple definition of blame safety for \lambda C. Our calculus \lambda S is implementation-ready: the first space-efficient calculus that is both straightforward to implement and easy to understand. We give two applications: first, using full abstraction from \lambda C to \lambda S to validate the challenging part of full abstraction between \lambda B and \lambda C; and, second, using full abstraction from \lambda B to \lambda S to easily establish the Fundamental Property of Casts, which required a custom bisimulation and six lemmas in earlier work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Zhang:2015:LFO, author = "Yizhou Zhang and Matthew C. Loring and Guido Salvaneschi and Barbara Liskov and Andrew C. Myers", title = "Lightweight, flexible object-oriented generics", journal = j-SIGPLAN, volume = "50", number = "6", pages = "436--445", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738008", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The support for generic programming in modern object-oriented programming languages is awkward and lacks desirable expressive power. 
We introduce an expressive genericity mechanism that adds expressive power and strengthens static checking, while remaining lightweight and simple in common use cases. Like type classes and concepts, the mechanism allows existing types to model type constraints retroactively. For expressive power, we expose models as named constructs that can be defined and selected explicitly to witness constraints; in common uses of genericity, however, types implicitly witness constraints without additional programmer effort. Models are integrated into the object-oriented style, with features like model generics, model-dependent types, model enrichment, model multimethods, constraint entailment, model inheritance, and existential quantification further extending expressive power in an object-oriented setting. We introduce the new genericity features and show that common generic programming idioms, including current generic libraries, can be expressed more precisely and concisely. The static semantics of the mechanism and a proof of a key decidability property can be found in an associated technical report.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Nguyen:2015:RCC, author = "Ph{\'u}c C. Nguy{\v{e}}n and David {Van Horn}", title = "Relatively complete counterexamples for higher-order programs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "446--456", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737971", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we study the problem of generating inputs to a higher-order program causing it to error. We first approach the problem in the setting of PCF, a typed, core functional language and contribute the first relatively complete method for constructing counterexamples for PCF programs. The method is relatively complete with respect to a first-order solver over the base types of PCF. In practice, this means an SMT solver can be used for the effective, automated generation of higher-order counterexamples for a large class of programs. We achieve this result by employing a novel form of symbolic execution for higher-order programs. The remarkable aspect of this symbolic execution is that even though symbolic higher-order inputs and values are considered, the path condition remains a first-order formula. Our handling of symbolic function application enables the reconstruction of higher-order counterexamples from this first-order formula. After establishing our main theoretical results, we sketch how to apply the approach to untyped, higher-order, stateful languages with first-class contracts and show how counterexample generation can be used to detect contract violations in this setting. 
To validate our approach, we implement a tool generating counterexamples for erroneous modules written in Racket.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Chu:2015:AIP, author = "Duc-Hiep Chu and Joxan Jaffar and Minh-Thai Trinh", title = "Automatic induction proofs of data-structures in imperative programs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "457--466", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737984", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We consider the problem of automated reasoning about dynamically manipulated data structures. Essential properties are encoded as predicates whose definitions are formalized via user-defined recursive rules. Traditionally, proving relationships between such properties is limited to the unfold-and-match (U+M) paradigm which employs systematic transformation steps of folding/unfolding the rules. A proof, using U+M, succeeds when we find a sequence of transformations that produces a final formula which is obviously provable by simply matching terms. Our contribution here is the addition of the fundamental principle of induction to this automated process. We first show that some proof obligations that are dynamically generated in the process can be used as induction hypotheses in the future, and then we show how to use these hypotheses in an induction step which generates a new proof obligation aside from those obtained by using the fold/unfold operations. While the adding of induction is an obvious need in general, no automated method has managed to include this in a systematic and general way. The main reason for this is the problem of avoiding circular reasoning. We overcome this with a novel checking condition. In summary, our contribution is a proof method which --- beyond U+M --- performs automatic formula re-writing by treating previously encountered obligations in each proof path as possible induction hypotheses. In the practical evaluation part of this paper, we show how the commonly used technique of using unproven lemmas can be avoided, using realistic benchmarks. This not only removes the current burden of coming up with the appropriate lemmas, but also significantly boosts up the verification process, since lemma applications, coupled with unfolding, often induce a large search space. 
In the end, our method can automatically reason about a new class of formulas arising from practical program verification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Carbonneaux:2015:CCR, author = "Quentin Carbonneaux and Jan Hoffmann and Zhong Shao", title = "Compositional certified resource bounds", journal = j-SIGPLAN, volume = "50", number = "6", pages = "467--478", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737955", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a new approach for automatically deriving worst-case resource bounds for C programs. The described technique combines ideas from amortized analysis and abstract interpretation in a unified framework to address four challenges for state-of-the-art techniques: compositionality, user interaction, generation of proof certificates, and scalability. Compositionality is achieved by incorporating the potential method of amortized analysis. It enables the derivation of global whole-program bounds with local derivation rules by naturally tracking size changes of variables in sequenced loops and function calls. The resource consumption of functions is described abstractly and a function call can be analyzed without access to the function body. User interaction is supported with a new mechanism that clearly separates qualitative and quantitative verification. A user can guide the analysis to derive complex non-linear bounds by using auxiliary variables and assertions. The assertions are separately proved using established qualitative techniques such as abstract interpretation or Hoare logic. Proof certificates are automatically generated from the local derivation rules. A soundness proof of the derivation system with respect to a formal cost semantics guarantees the validity of the certificates. Scalability is attained by an efficient reduction of bound inference to a linear optimization problem that can be solved by off-the-shelf LP solvers. The analysis framework is implemented in the publicly-available tool C4B. An experimental evaluation demonstrates the advantages of the new technique with a comparison of C4B with existing tools on challenging micro benchmarks and the analysis of more than 2900 lines of C code from the cBench benchmark suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Crary:2015:PPA, author = "Karl Crary and Michael J. Sullivan", title = "Peer-to-peer affine commitment using bitcoin", journal = j-SIGPLAN, volume = "50", number = "6", pages = "479--488", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737997", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The power of linear and affine logic lies in their ability to model state change. However, in a trustless, peer-to-peer setting, it is difficult to force principals to commit to state changes. 
We show how to solve the peer-to-peer affine commitment problem using a generalization of Bitcoin in which transactions deal in types rather than numbers. This has applications to proof-carrying authorization and mechanically executable contracts. Importantly, our system can be---and is---implemented on top of the existing Bitcoin network, so there is no need to recruit computing power to a new protocol.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Le:2015:TNT, author = "Ton Chanh Le and Shengchao Qin and Wei-Ngan Chin", title = "Termination and non-termination specification inference", journal = j-SIGPLAN, volume = "50", number = "6", pages = "489--498", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737993", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Techniques for proving termination and non-termination of imperative programs are usually considered as orthogonal mechanisms. In this paper, we propose a novel mechanism that analyzes and proves both program termination and non-termination at the same time. We first introduce the concept of second-order termination constraints and accumulate a set of relational assumptions on them via a Hoare-style verification. We then solve these assumptions with case analysis to determine the (conditional) termination and non-termination scenarios expressed in some specification logic form. In contrast to current approaches, our technique can construct a summary of terminating and non-terminating behaviors for each method. This enables modularity and reuse for our termination and non-termination proving processes. We have tested our tool on sample programs from a recent termination competition, and compared favorably against state-of-the-art termination analyzers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Emani:2015:CDM, author = "Murali Krishna Emani and Michael O'Boyle", title = "Celebrating diversity: a mixture of experts approach for runtime mapping in dynamic environments", journal = j-SIGPLAN, volume = "50", number = "6", pages = "499--508", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737999", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Matching program parallelism to platform parallelism using thread selection is difficult when the environment and available resources dynamically change. Existing compiler or runtime approaches are typically based on a one-size fits all policy. There is little ability to either evaluate or adapt the policy when encountering new external workloads or hardware resources. This paper focuses on selecting the best number of threads for a parallel application in dynamic environments. It develops a new scheme based on a mixture of experts approach. It learns online which, of a number of existing policies, or experts, is best suited for a particular environment without having to try out each policy. 
It does this by using a novel environment predictor as a proxy for the quality of an expert thread selection policy. Additional expert policies can easily be added and are selected only when appropriate. We evaluate our scheme in environments with varying external workloads and hardware resources. We then consider the case when workloads use affinity scheduling or are themselves adaptive and show that our approach, in all cases, outperforms existing schemes and surprisingly improves workload performance. On average, we improve 1.66x over OpenMP default, 1.34x over an online scheme, 1.25x over an offline policy and 1.2x over a state-of-the-art analytic model. Determining the right number and type of experts is an open problem and our initial analysis shows that adding more experts improves accuracy and performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Ren:2015:EER, author = "Bin Ren and Youngjoon Jo and Sriram Krishnamoorthy and Kunal Agrawal and Milind Kulkarni", title = "Efficient execution of recursive programs on commodity vector hardware", journal = j-SIGPLAN, volume = "50", number = "6", pages = "509--520", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738004", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The pursuit of computational efficiency has led to the proliferation of throughput-oriented hardware, from GPUs to increasingly wide vector units on commodity processors and accelerators. This hardware is designed to efficiently execute data-parallel computations in a vectorized manner. However, many algorithms are more naturally expressed as divide-and-conquer, recursive, task-parallel computations. In the absence of data parallelism, it seems that such algorithms are not well suited to throughput-oriented architectures. This paper presents a set of novel code transformations that expose the data parallelism latent in recursive, task-parallel programs. These transformations facilitate straightforward vectorization of task-parallel programs on commodity hardware. We also present scheduling policies that maintain high utilization of vector resources while limiting space usage. Across several task-parallel benchmarks, we demonstrate both efficient vector resource utilization and substantial speedup on chips using Intel's SSE4.2 vector units, as well as accelerators using Intel's AVX512 units.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Venkat:2015:LDT, author = "Anand Venkat and Mary Hall and Michelle Strout", title = "Loop and data transformations for sparse matrix code", journal = j-SIGPLAN, volume = "50", number = "6", pages = "521--532", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738003", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces three new compiler transformations for representing and transforming sparse matrix computations and their data representations.
In cooperation with run-time inspection, our compiler derives transformed matrix representations and associated transformed code to implement a variety of representations targeting different architecture platforms. This systematic approach to combining code and data transformations on sparse computations, which extends a polyhedral transformation and code generation framework, permits the compiler to compose these transformations with other transformations to generate code that is on average within 5\% and often exceeds manually-tuned, high-performance sparse matrix libraries CUSP and OSKI. Additionally, the compiler-generated inspector codes are on average 1.5$ \times $ faster than OSKI and perform comparably to CUSP, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Prountzos:2015:SPG, author = "Dimitrios Prountzos and Roman Manevich and Keshav Pingali", title = "Synthesizing parallel graph programs via automated planning", journal = j-SIGPLAN, volume = "50", number = "6", pages = "533--544", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737953", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a system that uses automated planning to synthesize correct and efficient parallel graph programs from high-level algorithmic specifications. Automated planning allows us to use constraints to declaratively encode program transformations such as scheduling, implementation selection, and insertion of synchronization. Each plan emitted by the planner satisfies all constraints simultaneously, and corresponds to a composition of these transformations. In this way, we obtain an integrated compilation approach for a very challenging problem domain. We have used this system to synthesize parallel programs for four graph problems: triangle counting, maximal independent set computation, preflow-push maxflow, and connected components. Experiments on a variety of inputs show that the synthesized implementations perform competitively with hand-written, highly-tuned code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Marr:2015:ZOM, author = "Stefan Marr and Chris Seaton and St{\'e}phane Ducasse", title = "Zero-overhead metaprogramming: reflection and metaobject protocols fast and without compromises", journal = j-SIGPLAN, volume = "50", number = "6", pages = "545--554", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737963", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Runtime metaprogramming enables many useful applications and is often a convenient solution to solve problems in a generic way, which makes it widely used in frameworks, middleware, and domain-specific languages. However, powerful metaobject protocols are rarely supported and even common concepts such as reflective method invocation or dynamic proxies are not optimized.
Solutions proposed in literature either restrict the metaprogramming capabilities or require application or library developers to apply performance improving techniques. For overhead-free runtime metaprogramming, we demonstrate that dispatch chains, a generalized form of polymorphic inline caches common to self-optimizing interpreters, are a simple optimization at the language-implementation level. Our evaluation with self-optimizing interpreters shows that unrestricted metaobject protocols can be realized for the first time without runtime overhead, and that this optimization is applicable for just-in-time compilation of interpreters based on meta-tracing as well as partial evaluation. In this context, we also demonstrate that optimizing common reflective operations can lead to significant performance improvements for existing applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Isradisaikul:2015:FCP, author = "Chinawat Isradisaikul and Andrew C. Myers", title = "Finding counterexamples from parsing conflicts", journal = j-SIGPLAN, volume = "50", number = "6", pages = "555--564", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737961", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Writing a parser remains remarkably painful. Automatic parser generators offer a powerful and systematic way to parse complex grammars, but debugging conflicts in grammars can be time-consuming even for experienced language designers. Better tools for diagnosing parsing conflicts will alleviate this difficulty. This paper proposes a practical algorithm that generates compact, helpful counterexamples for LALR grammars. For each parsing conflict in a grammar, a counterexample demonstrating the conflict is constructed. When the grammar in question is ambiguous, the algorithm usually generates a compact counterexample illustrating the ambiguity. This algorithm has been implemented as an extension to the CUP parser generator. The results from applying this implementation to a diverse collection of faulty grammars show that the algorithm is practical, effective, and suitable for inclusion in other LALR parser generators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Leung:2015:IPS, author = "Alan Leung and John Sarracino and Sorin Lerner", title = "Interactive parser synthesis by example", journal = j-SIGPLAN, volume = "50", number = "6", pages = "565--574", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738002", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite decades of research on parsing, the construction of parsers remains a painstaking, manual process prone to subtle bugs and pitfalls. We present a programming-by-example framework called Parsify that is able to synthesize a parser from input/output examples. The user does not write a single line of code. 
To achieve this, Parsify provides: (a) an iterative algorithm for synthesizing and refining a grammar one example at a time, (b) an interface that provides immediate visual feedback in response to changes in the grammar being refined, and (c) a graphical mechanism for specifying example parse trees using only textual selections. We empirically demonstrate the viability of our approach by using Parsify to construct parsers for source code drawn from Verilog, SQL, Apache, and Tiger.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Lucia:2015:SSP, author = "Brandon Lucia and Benjamin Ransford", title = "A simpler, safer programming and execution model for intermittent systems", journal = j-SIGPLAN, volume = "50", number = "6", pages = "575--585", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737978", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy harvesting enables novel devices and applications without batteries, but intermittent operation under energy harvesting poses new challenges to memory consistency that threaten to leave applications in failed states not reachable in continuous execution. This paper presents analytical models that aid in reasoning about intermittence. Using these, we develop DINO (Death Is Not an Option), a programming and execution model that simplifies programming for intermittent systems and ensures volatile and nonvolatile data consistency despite near-constant interruptions. DINO is the first system to address these consistency problems in the context of intermittent execution. We evaluate DINO on three energy-harvesting hardware platforms running different applications. The applications fail and exhibit error without DINO, but run correctly with DINO's modest 1.8-2.7$ \times $ run-time overhead. DINO also dramatically simplifies programming, reducing the set of possible failure-related control transfers by 5--9$ \times $.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Machado:2015:CDD, author = "Nuno Machado and Brandon Lucia and Lu{\'\i}s Rodrigues", title = "Concurrency debugging with differential schedule projections", journal = j-SIGPLAN, volume = "50", number = "6", pages = "586--595", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737973", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Symbiosis: a concurrency debugging technique based on novel differential schedule projections (DSPs). A DSP shows the small set of memory operations and data-flows responsible for a failure, as well as a reordering of those elements that avoids the failure. To build a DSP, Symbiosis first generates a full, failing, multithreaded schedule via thread path profiling and symbolic constraint solving. Symbiosis selectively reorders events in the failing schedule to produce a non-failing, alternate schedule. 
A DSP reports the ordering and data-flow differences between the failing and non-failing schedules. Our evaluation on buggy real-world software and benchmarks shows that, in practical time, Symbiosis generates DSPs that both isolate the small fraction of event orders and data-flows responsible for the failure, and show which event reorderings prevent failing. In our experiments, DSPs contain 81\% fewer events and 96\% less data-flows than the full failure-inducing schedules. Moreover, by allowing developers to focus on only a few events, DSPs reduce the amount of time required to find a valid fix.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Srinivasan:2015:SMC, author = "Venkatesh Srinivasan and Thomas Reps", title = "Synthesis of machine code from semantics", journal = j-SIGPLAN, volume = "50", number = "6", pages = "596--607", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737960", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we present a technique to synthesize machine-code instructions from a semantic specification, given as a Quantifier-Free Bit-Vector (QFBV) logic formula. Our technique uses an instantiation of the Counter-Example Guided Inductive Synthesis (CEGIS) framework, in combination with search-space pruning heuristics to synthesize instruction-sequences. To counter the exponential cost inherent in enumerative synthesis, our technique uses a divide-and-conquer strategy to break the input QFBV formula into independent sub-formulas, and synthesize instructions for the sub-formulas. Synthesizers created by our technique could be used to create semantics-based binary rewriting tools such as optimizers, partial evaluators, program obfuscators/de-obfuscators, etc. Our experiments for Intel's IA-32 instruction set show that, in comparison to our baseline algorithm, our search-space pruning heuristics reduce the synthesis time by a factor of 473, and our divide-and-conquer strategy reduces the synthesis time by a further 3 to 5 orders of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Gonnord:2015:SRF, author = "Laure Gonnord and David Monniaux and Gabriel Radanne", title = "Synthesis of ranking functions using extremal counterexamples", journal = j-SIGPLAN, volume = "50", number = "6", pages = "608--618", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737976", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a complete method for synthesizing lexicographic linear ranking functions (and thus proving termination), supported by inductive invariants, in the case where the transition relation of the program includes disjunctions and existentials (large block encoding of control flow). 
Previous work would either synthesize a ranking function at every basic block head, not just loop headers, which reduces the scope of programs that may be proved to be terminating, or expand large block transitions including tests into (exponentially many) elementary transitions, prior to computing the ranking function, resulting in a very large global constraint system. In contrast, our algorithm incrementally refines a global linear constraint system according to extremal counterexamples: only constraints that exclude spurious solutions are included. Experiments with our tool Termite show marked performance and scalability improvements compared to other systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Osera:2015:TED, author = "Peter-Michael Osera and Steve Zdancewic", title = "Type-and-example-directed program synthesis", journal = j-SIGPLAN, volume = "50", number = "6", pages = "619--630", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738007", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents an algorithm for synthesizing recursive functions that process algebraic datatypes. It is founded on proof-theoretic techniques that exploit both type information and input-output examples to prune the search space. The algorithm uses refinement trees, a data structure that succinctly represents constraints on the shape of generated code. We evaluate the algorithm by using a prototype implementation to synthesize more than 40 benchmarks and several non-trivial larger examples. Our results demonstrate that the approach meets or outperforms the state-of-the-art for this domain, in terms of synthesis time or attainable size of the generated programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '15 conference proceedings.", } @Article{Tu:2015:CIE, author = "Cheng-Chun Tu and Michael Ferdman and Chao-tung Lee and Tzi-cker Chiueh", title = "A Comprehensive Implementation and Evaluation of Direct Interrupt Delivery", journal = j-SIGPLAN, volume = "50", number = "7", pages = "1--15", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731189", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As the performance overhead associated with CPU and memory virtualization becomes largely negligible, research efforts are directed toward reducing the I/O virtualization overhead, which mainly comes from two sources: DMA set-up and payload copy, and interrupt delivery. The advent of SRIOV and MRIOV effectively reduces the DMA-related virtualization overhead to a minimum. Therefore, the last battleground for minimizing virtualization overhead is how to directly deliver every interrupt to its target VM without involving the hypervisor. This paper describes the design, implementation, and evaluation of a KVM-based direct interrupt delivery system called DID. DID delivers interrupts from SRIOV devices, virtual devices, and timers to their target VMs directly, completely avoiding VM exits. 
Moreover, DID does not require any modifications to the VM's operating system and preserves the correct priority among interrupts in all cases. We demonstrate that DID reduces the number of VM exits by a factor of 100 for I/O-intensive workloads, decreases the interrupt invocation latency by 80\%, and improves the throughput of a VM running Memcached by a factor of 3.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Pfefferle:2015:HVF, author = "Jonas Pfefferle and Patrick Stuedi and Animesh Trivedi and Bernard Metzler and Ionnis Koltsidas and Thomas R. Gross", title = "A Hybrid {I/O} Virtualization Framework for {RDMA}-capable Network Interfaces", journal = j-SIGPLAN, volume = "50", number = "7", pages = "17--30", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731200", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "DMA-capable interconnects, providing ultra-low latency and high bandwidth, are increasingly being used in the context of distributed storage and data processing systems. However, the deployment of such systems in virtualized data centers is currently inhibited by the lack of a flexible and high-performance virtualization solution for RDMA network interfaces. In this work, we present a hybrid virtualization architecture which builds upon the concept of separation of paths for control and data operations available in RDMA. With hybrid virtualization, RDMA control operations are virtualized using hypervisor involvement, while data operations are set up to bypass the hypervisor completely. We describe HyV (Hybrid Virtualization), a virtualization framework for RDMA devices implementing such a hybrid architecture. In the paper, we provide a detailed evaluation of HyV for different RDMA technologies and operations. We further demonstrate the advantages of HyV in the context of a real distributed system by running RAMCloud on a set of HyV-enabled virtual machines deployed across a 6-node RDMA cluster. All of the performance results we obtained illustrate that hybrid virtualization enables bare-metal RDMA performance inside virtual machines while retaining the flexibility typically associated with paravirtualization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Younge:2015:SHP, author = "Andrew J. Younge and John Paul Walters and Stephen P. Crago and Geoffrey C. 
Fox", title = "Supporting High Performance Molecular Dynamics in Virtualized Clusters using {IOMMU}, {SR-IOV}, and {GPUDirect}", journal = j-SIGPLAN, volume = "50", number = "7", pages = "31--38", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731194", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Cloud Infrastructure-as-a-Service paradigms have recently shown their utility for a vast array of computational problems, ranging from advanced web service architectures to high throughput computing. However, many scientific computing applications have been slow to adapt to virtualized cloud frameworks. This is due to performance impacts of virtualization technologies, coupled with the lack of advanced hardware support necessary for running many high performance scientific applications at scale. By using KVM virtual machines that leverage both Nvidia GPUs and InfiniBand, we show that molecular dynamics simulations with LAMMPS and HOOMD run at near-native speeds. This experiment also illustrates how virtualized environments can support the latest parallel computing paradigms, including both MPI+CUDA and new GPUDirect RDMA functionality. Specific findings show initial promise in scaling of such applications to larger production deployments targeting large scale computational workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Guo:2015:PBL, author = "Fei Guo and Seongbeom Kim and Yury Baskakov and Ishan Banerjee", title = "Proactively Breaking Large Pages to Improve Memory Overcommitment Performance in {VMware ESXi}", journal = j-SIGPLAN, volume = "50", number = "7", pages = "39--51", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731187", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "VMware ESXi leverages hardware support for MMU virtualization available in modern Intel/AMD CPUs. To optimize address translation performance when running on such CPUs, ESXi preferably uses host large pages (2MB in x86-64 systems) to back VM's guest memory. While using host large pages provides best performance when host has sufficient free memory, it increases host memory pressure and effectively defeats page sharing. Hence, the host is more likely to hit the point where ESXi has to reclaim VM memory through much more expensive techniques such as ballooning or host swapping. As a result, using host large pages may significantly hurt consolidation ratio. To deal with this problem, we propose a new host large page management policy that allows to: (a) identify 'cold' large pages and break them even when host has plenty of free memory; (b) break all large pages proactively when host free memory becomes scarce, but before the host starts ballooning or swapping; (c) reclaim the small pages within the broken large pages through page sharing. 
With the new policy, the shareable small pages can be shared much earlier and the amount of memory that needs to be ballooned or swapped can be largely reduced when host memory pressure is high. We also propose an algorithm to dynamically adjust the page sharing rate when proactively breaking large pages using a VM large page shareability estimator for higher efficiency. Experimental results show that the proposed large page management policy can improve the performance of various workloads up to 2.1x by significantly reducing the amount of ballooned or swapped memory when host memory pressure is high. Applications still fully benefit from host large pages when memory pressure is low.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Wang:2015:HPI, author = "Zhe Wang and Jianjun Li and Chenggang Wu and Dongyan Yang and Zhenjiang Wang and Wei-Chung Hsu and Bin Li and Yong Guan", title = "{HSPT}: Practical Implementation and Efficient Management of Embedded Shadow Page Tables for Cross-{ISA} System Virtual Machines", journal = j-SIGPLAN, volume = "50", number = "7", pages = "53--64", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731188", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Cross-ISA (Instruction Set Architecture) system-level virtual machines have significant research and practical value. For example, several recently announced virtual smart phones for iOS which run smart phone applications on x86-based PCs are deployed on cross-ISA system-level virtual machines. Also, for mobile device application development, by emulating the Android/ARM environment on the more powerful x86-64 platform, application development and debugging become more convenient and productive. However, the virtualization layer often incurs high performance overhead. The key overhead comes from memory virtualization where a guest virtual address (GVA) must go through multi-level address translation to become a host physical address (HPA). The Embedded Shadow Page Table (ESPT) approach has been proposed to effectively decrease this address translation cost. ESPT directly maps GVA to HPA, thus avoiding the lengthy guest virtual to guest physical, guest physical to host virtual, and host virtual to host physical address translation. However, the original ESPT work has a few drawbacks. For example, its implementation relies on a loadable kernel module (LKM) to manage the shadow page table. Using LKMs is less desirable for system virtual machines due to portability, security and maintainability concerns. Our work proposes a different, yet more practical, implementation to address the shortcomings. Instead of relying on LKMs, our approach adopts a shared memory mapping scheme to maintain the shadow page table (SPT) using only the ``mmap'' system call. Furthermore, this work studies the support of SPT for multi-processing in greater detail.
It devises three different SPT organizations and evaluates their strengths and weaknesses with standard and real Android applications on a system virtual machine that emulates the Android/ARM platform on x86-64 systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Kehne:2015:GEO, author = "Jens Kehne and Jonathan Metter and Frank Bellosa", title = "{GPUswap}: Enabling Oversubscription of {GPU} Memory through Transparent Swapping", journal = j-SIGPLAN, volume = "50", number = "7", pages = "65--77", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731192", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the last few years, GPUs have been finding their way into cloud computing platforms, allowing users to benefit from the performance of GPUs at low cost. However, a large portion of the cloud's cost advantage traditionally stems from oversubscription: Cloud providers rent out more resources to their customers than are actually available, expecting that the customers will not actually use all of the promised resources. For GPU memory, this oversubscription is difficult due to the lack of support for demand paging in current GPUs. Therefore, recent approaches to enabling oversubscription of GPU memory resort to software scheduling of GPU kernels --- which has been shown to induce significant runtime overhead in applications even if sufficient GPU memory is available --- to ensure that data is present on the GPU when referenced. In this paper, we present GPUswap, a novel approach to enabling oversubscription of GPU memory that does not rely on software scheduling of GPU kernels. GPUswap uses the GPU's ability to access system RAM directly to extend the GPU's own memory. To that end, GPUswap transparently relocates data from the GPU to system RAM in response to memory pressure. GPUswap ensures that all data is permanently accessible to the GPU and thus allows applications to submit commands to the GPU directly at any time, without the need for software scheduling. Experiments with our prototype implementation show that GPU applications can still execute even with only 20 MB of GPU memory available. In addition, while software scheduling suffers from permanent overhead even with sufficient GPU memory available, our approach executes GPU applications with native performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Gupta:2015:HER, author = "Vishal Gupta and Min Lee and Karsten Schwan", title = "{HeteroVisor}: Exploiting Resource Heterogeneity to Enhance the Elasticity of Cloud Platforms", journal = j-SIGPLAN, volume = "50", number = "7", pages = "79--92", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731191", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents HeteroVisor, a heterogeneity-aware hypervisor that exploits resource heterogeneity to enhance the elasticity of cloud systems.
Introducing the notion of 'elasticity' (E) states, HeteroVisor permits applications to manage their changes in resource requirements as state transitions that implicitly move their execution among heterogeneous platform components. Masking the details of platform heterogeneity from virtual machines, the E-state abstraction allows applications to adapt their resource usage in a fine-grained manner via VM-specific 'elasticity drivers' encoding VM-desired policies. The approach is explored for the heterogeneous processor and memory subsystems evolving for modern server platforms, leading to mechanisms that can manage these heterogeneous resources dynamically and as required by the different VMs being run. HeteroVisor is implemented for the Xen hypervisor, with mechanisms that go beyond core scaling to also deal with memory resources, via the online detection of hot memory pages and transparent page migration. Evaluation on an emulated heterogeneous platform uses workload traces from real-world data, demonstrating the ability to provide high on-demand performance while also reducing resource usage for these workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Wang:2015:DAA, author = "Hui Wang and Canturk Isci and Lavanya Subramanian and Jongmoo Choi and Depei Qian and Onur Mutlu", title = "{A-DRM}: Architecture-aware Distributed Resource Management of Virtualized Clusters", journal = j-SIGPLAN, volume = "50", number = "7", pages = "93--106", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731202", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Virtualization technologies has been widely adopted by large-scale cloud computing platforms. These virtualized systems employ distributed resource management (DRM) to achieve high resource utilization and energy savings by dynamically migrating and consolidating virtual machines. DRM schemes usually use operating-system-level metrics, such as CPU utilization, memory capacity demand and I/O utilization, to detect and balance resource contention. However, they are oblivious to microarchitecture-level resource interference (e.g., memory bandwidth contention between different VMs running on a host), which is currently not exposed to the operating system. We observe that the lack of visibility into microarchitecture-level resource interference significantly impacts the performance of virtualized systems. Motivated by this observation, we propose a novel architecture-aware DRM scheme (ADRM), that takes into account microarchitecture-level resource interference when making migration decisions in a virtualized cluster. ADRM makes use of three core techniques: (1) a profiler to monitor the microarchitecture-level resource usage behavior online for each physical host, (2) a memory bandwidth interference model to assess the interference degree among virtual machines on a host, and (3) a cost-benefit analysis to determine a candidate virtual machine and a host for migration. 
Real system experiments on thirty randomly selected combinations of applications from the CPU2006, PARSEC, STREAM, and NAS Parallel Benchmark suites in a four-host virtualized cluster show that ADRM can improve performance by up to 26.55\%, with an average of 9.67\%, compared to traditional DRM schemes that lack visibility into microarchitecture-level resource utilization and contention.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Singh:2015:TVC, author = "Rayman Preet Singh and Tim Brecht and S. Keshav", title = "Towards {VM} Consolidation Using a Hierarchy of Idle States", journal = j-SIGPLAN, volume = "50", number = "7", pages = "107--119", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731195", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Typical VM consolidation approaches re-pack VMs into fewer physical machines, resulting in energy and cost savings [13, 19, 23, 40]. Recent work has explored a just-in-time approach to VM consolidation by transitioning VMs to an inactive state when idle and activating them on the arrival of client requests [17, 21]. This leads to increased VM density at the cost of an increase in client request latency (called miss penalty). The VM density so obtained, although greater, is still limited by the number of VMs that can be hosted in the one inactive state. If idle VMs were hosted in multiple inactive states, VM density could be increased further while ensuring small miss penalties. However, VMs in different inactive states have different capacities, activation times, and resource requirements. Therefore, a key question is: How should VMs be transitioned between different states to minimize the expected miss penalty? This paper explores the hosting of idle VMs in a hierarchy of multiple such inactive states, and studies the effect of different idle VM management policies on VM density and miss penalties. We formulate a mathematical model for the problem, and provide a theoretical lower bound on the miss penalty. Using an off-the-shelf virtualization solution (LXC [2]), we demonstrate how the required model parameters can be obtained. We evaluate a variety of policies and quantify their miss penalties for different VM densities.
We observe that some policies consolidate up to 550 VMs per machine with average miss penalties smaller than 1 ms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Kyle:2015:ADA, author = "Stephen Kyle and Hugh Leather and Bj{\"o}rn Franke and Dave Butcher and Stuart Monteith", title = "Application of Domain-aware Binary Fuzzing to Aid {Android} Virtual Machine Testing", journal = j-SIGPLAN, volume = "50", number = "7", pages = "121--132", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731198", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "The development of a new application virtual machine (VM), like the creation of any complex piece of software, is a bug-prone process. In version 5.0, the widely-used Android operating system has changed from the Dalvik VM to the newly-developed ART VM to execute Android applications. As new iterations of this VM are released, how can the developers aim to reduce the number of potentially security-threatening bugs that make it into the final product? In this paper we combine domain-aware binary fuzzing and differential testing to produce DexFuzz, a tool that exploits the presence of multiple modes of execution within a VM to test for defects. These modes of execution include the interpreter and a runtime that executes ahead-of-time compiled code. We find and present a number of bugs in the in-development version of ART in the Android Open Source Project. We also assess DexFuzz's ability to highlight defects in the experimental version of ART released in the previous version of Android, 4.4, finding 189 crashing programs and 15 divergent programs that indicate defects after only 5,000 attempts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Suneja:2015:EVI, author = "Sahil Suneja and Canturk Isci and Eyal de Lara and Vasanth Bala", title = "Exploring {VM} Introspection: Techniques and Trade-offs", journal = j-SIGPLAN, volume = "50", number = "7", pages = "133--146", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731196", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "While there are a variety of existing virtual machine introspection (VMI) techniques, their latency, overhead, complexity and consistency trade-offs are not clear. In this work, we address this gap by first organizing the various existing VMI techniques into a taxonomy based upon their operational principles, so that they can be put into context. Next we perform a thorough exploration of their trade-offs both qualitatively and quantitatively. We present a comprehensive set of observations and best practices for efficient, accurate and consistent VMI operation based on our experiences with these techniques. 
Our results show the stunning range of variations in performance, complexity and overhead with different VMI techniques. We further present a deep dive on VMI consistency aspects to understand the sources of inconsistency in observed VM state and show that, contrary to common expectation, pause-and-introspect based VMI techniques achieve very little to improve consistency despite their substantial performance impact.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Zeng:2015:PPH, author = "Junyuan Zeng and Yangchun Fu and Zhiqiang Lin", title = "{PEMU}: a Pin Highly Compatible Out-of-{VM} Dynamic Binary Instrumentation Framework", journal = j-SIGPLAN, volume = "50", number = "7", pages = "147--160", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731201", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Over the past 20 years, we have witnessed a widespread adoption of dynamic binary instrumentation (DBI) for numerous program analyses and security applications including program debugging, profiling, reverse engineering, and malware analysis. To date, there are many DBI platforms, and the most popular one is Pin, which provides various instrumentation APIs for process instrumentation. However, Pin does not support the instrumentation of OS kernels. In addition, the execution of the instrumentation and analysis routine is always inside the virtual machine (VM). Consequently, it cannot support any out-of-VM introspection that requires strong isolation. Therefore, this paper presents PEMU, a new open source DBI framework that is compatible with Pin-APIs, but supports out-of-VM introspection for both user level processes and OS kernels. Unlike in-VM instrumentation in which there is no semantic gap, for out-of-VM introspection we have to bridge the semantic gap and provide abstractions (i.e., APIs) for programmers. One important feature of PEMU is its API compatibility with Pin. As such, many Pin plugins are able to execute atop PEMU without any source code modification. We have implemented PEMU, and our experimental results with the SPEC 2006 benchmarks show that PEMU introduces reasonable overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Jaffer:2015:IRD, author = "Shehbaz Jaffer and Piyus Kedia and Sorav Bansal", title = "Improving Remote Desktopping Through Adaptive Record\slash Replay", journal = j-SIGPLAN, volume = "50", number = "7", pages = "161--172", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731193", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Accessing the display of a computer remotely is popularly called remote desktopping.
Remote desktopping software installs at both the user-facing client computer and the remote server computer; it simulates user's input events at server, and streams the corresponding display changes to client, thus providing an illusion to the user of controlling the remote machine using local input devices (e.g., keyboard/mouse). Many such remote desktopping tools are widely used. We show that if the remote server is a virtual machine (VM) and the client is reasonably powerful (e.g., current laptop and desktop grade hardware), VM deterministic replay capabilities can be used adaptively to significantly reduce the network bandwidth consumption and server-side CPU utilization of a remote desktopping tool. We implement these optimizations in a tool based on Qemu/KVM virtualization platform and VNC remote desktopping platform. Our tool reduces VNC's network bandwidth consumption by up to 9x and server-side CPU utilization by up to 56\% for popular graphics-intensive applications. On the flip side, our techniques consume higher CPU/memory/disk resources at the client. The effect of our optimizations on user-perceived latency is negligible.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Oh:2015:MWA, author = "JinSeok Oh and Jin-woo Kwon and Hyukwoo Park and Soo-Mook Moon", title = "Migration of {Web} Applications with Seamless Execution", journal = j-SIGPLAN, volume = "50", number = "7", pages = "173--185", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731197", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Web applications (apps) are programmed using HTML5, CSS, and JavaScript, and are distributed in the source code format. Web apps can be executed on any devices where a web browser is installed, allowing one-source, multi-platform environment. We can exploit this advantage of platform independence for a new user experience called app migration, which allows migrating an app in the middle of execution seamlessly between smart devices. This paper proposes such a migration framework for web apps where we can save the current state of a running app and resume its execution on a different device by restoring the saved state. We save the web app's state in the form of a snapshot, which is actually another web app whose execution can restore the saved state. In the snapshot, the state of the JavaScript variables and DOM trees are saved using the JSON format. We solved some of the saving/restoring problems related to event handlers and closures by accessing the browser and the JavaScript engine internals. Our framework does not require instrumenting an app or changing its source code, but works for the original app. We implemented the framework on the Chrome browser with the V8 JavaScript engine and successfully migrated non-trivial sample apps with reasonable saving and restoring overhead. 
We also discuss other uses of the snapshot for optimizations and user experiences for the web platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Ren:2015:ASE, author = "Jianbao Ren and Yong Qi and Yuehua Dai and Xiaoguang Wang and Yi Shi", title = "{AppSec}: a Safe Execution Environment for Security Sensitive Applications", journal = j-SIGPLAN, volume = "50", number = "7", pages = "187--199", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731199", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A malicious OS kernel can easily access a user's private data in main memory and pry into human-machine interaction data, even when privacy enforcement is applied at the application or OS level. This paper introduces AppSec, a hypervisor-based safe execution environment, to protect both the memory data and human-machine interaction data of security-sensitive applications from the untrusted OS transparently. AppSec provides several security mechanisms on an untrusted OS. AppSec introduces a safe loader to check the code integrity of the application and dynamic shared objects. During runtime, AppSec protects the application and dynamic shared objects from being modified and verifies kernel memory accesses according to the application's intention. AppSec provides a device isolation mechanism to prevent human-machine interaction devices from being accessed by a compromised kernel. On top of that, AppSec further provides a privilege-based window system to protect the application's X resources. The major advantages of AppSec are threefold. First, AppSec verifies and protects all dynamic shared objects during runtime. Second, AppSec mediates kernel memory access according to the application's intention rather than coarsely encrypting all of the application's data. Third, AppSec provides a trusted I/O path from the end-user to the application. A prototype of AppSec is implemented and shows that AppSec is efficient and practical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Jin:2015:HAS, author = "Seongwook Jin and Jinho Seol and Jaehyuk Huh and Seungryoul Maeng", title = "Hardware-Assisted Secure Resource Accounting under a Vulnerable Hypervisor", journal = j-SIGPLAN, volume = "50", number = "7", pages = "201--213", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731203", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "With the proliferation of cloud computing to outsource computation to remote servers, the accountability of computational resources has emerged as an important new challenge for both cloud users and providers. Among cloud resources, the actual allocation of CPU and memory is difficult to verify, since current virtualization techniques attempt to hide the discrepancy between physical and virtual allocations for these two resources.
This paper proposes an online verifiable resource accounting technique for CPU and memory allocation for cloud computing. Unlike prior approaches for cloud resource accounting, the proposed accounting mechanism, called Hardware-assisted Resource Accounting (HRA), uses the hardware support for system management mode (SMM) and virtualization to provide secure resource accounting, even if the hypervisor is compromised. Using the secure isolated execution support of SMM, this study investigates two aspects of verifiable resource accounting for cloud systems. First, this paper presents how the hardware-assisted SMM and virtualization techniques can be used to implement the secure resource accounting mechanism even under a compromised hypervisor. Second, the paper investigates a sample-based resource accounting technique to minimize performance overheads. Using a statistical random sampling method, the technique estimates the overall CPU and memory allocation status with 99\%--100\% accuracy and performance degradations of 0.1\%--0.5\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Cui:2015:PPA, author = "Lei Cui and Tianyu Wo and Bo Li and Jianxin Li and Bin Shi and Jinpeng Huai", title = "{PARS}: a Page-Aware Replication System for Efficiently Storing Virtual Machine Snapshots", journal = j-SIGPLAN, volume = "50", number = "7", pages = "215--228", month = jul, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2817817.2731190", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Virtual machine (VM) snapshots enhance system availability by saving the running state into stable storage during failure-free execution and rolling back to the snapshot point upon failures. Unfortunately, the snapshot state may be lost due to disk failures, so that the VM cannot be recovered. Popular distributed file systems employ replication to tolerate disk failures by placing redundant copies across dispersed disks. However, unless user-specific personalization is provided, these systems consider the data in the file as of the same importance and create identical copies of the entire file, leading to non-trivial additional storage overhead. This paper proposes a page-aware replication system (PARS) to store VM snapshots efficiently. PARS employs VM introspection to explore how a page is used by the guest, and classifies pages by their importance to system execution. If a page is critical, PARS replicates it into multiple copies to ensure high availability and long-term durability. Otherwise, the loss of this page causes no harm to proper system operation, so PARS saves only one copy of the page. Consequently, PARS improves storage efficiency without compromising availability. We have implemented PARS to justify its practicality.
The experimental results demonstrate that PARS achieves 53.9\% space saving compared to the native replication approach in HDFS which replicates the whole snapshot file fully and identically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '15 conference proceedings.", } @Article{Gramoli:2015:MTY, author = "Vincent Gramoli", title = "More than you ever wanted to know about synchronization: synchrobench, measuring the impact of the synchronization on concurrent algorithms", journal = j-SIGPLAN, volume = "50", number = "8", pages = "1--10", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688501", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we present the most extensive comparison of synchronization techniques. We evaluate 5 different synchronization techniques through a series of 31 data structure algorithms from the recent literature on 3 multicore platforms from Intel, Sun Microsystems and AMD. To this end, we developed in C/C++ and Java a new micro-benchmark suite, called Synchrobench, hence helping the community evaluate new data structures and synchronization techniques. The main conclusion of this evaluation is threefold: (i) although compare-and-swap helps achieving the best performance on multicores, doing so correctly is hard; (ii) optimistic locking offers varying performance results while transactional memory offers more consistent results; and (iii) copy-on-write and read-copy-update suffer more from contention than any other technique but could be combined with others to derive efficient algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Alistarh:2015:SSR, author = "Dan Alistarh and Justin Kopinsky and Jerry Li and Nir Shavit", title = "The {SprayList}: a scalable relaxed priority queue", journal = j-SIGPLAN, volume = "50", number = "8", pages = "11--20", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688523", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "High-performance concurrent priority queues are essential for applications such as task scheduling and discrete event simulation. Unfortunately, even the best performing implementations do not scale past a number of threads in the single digits. This is because of the sequential bottleneck in accessing the elements at the head of the queue in order to perform a DeleteMin operation. In this paper, we present the SprayList, a scalable priority queue with relaxed ordering semantics. Starting from a non-blocking SkipList, the main innovation behind our design is that the DeleteMin operations avoid a sequential bottleneck by ``spraying'' themselves onto the head of the SkipList list in a coordinated fashion. The spraying is implemented using a carefully designed random walk, so that DeleteMin returns an element among the first $O(p \log^3 p)$ in the list, with high probability, where $p$ is the number of threads. 
We prove that the running time of a DeleteMin operation is $O(\log^3 p)$, with high probability, independent of the size of the list. Our experiments show that the relaxed semantics allow the data structure to scale for high thread counts, comparable to a classic unordered SkipList. Furthermore, we observe that, for reasonably parallel workloads, the scalability benefits of relaxation considerably outweigh the additional work due to out-of-order execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Arbel:2015:PRR, author = "Maya Arbel and Adam Morrison", title = "Predicate {RCU}: an {RCU} for scalable concurrent updates", journal = j-SIGPLAN, volume = "50", number = "8", pages = "21--30", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Read-copy update (RCU) is a shared memory synchronization mechanism with scalable synchronization-free reads that nevertheless execute correctly with concurrent updates. To guarantee the consistency of such reads, an RCU update transitioning the data structure between certain states must wait for the completion of all existing reads. Unfortunately, these waiting periods quickly become a bottleneck, and thus RCU remains unused in data structures that require scalable, fine-grained, update operations. To solve this problem, we present Predicate RCU (PRCU), an RCU variant in which an update waits only for the reads whose consistency it affects, which are specified by a user-supplied predicate. We explore the trade-offs in implementing PRCU, describing implementations that reduce wait times by 10--100x with varying overhead on reads on modern x86 multiprocessor machines. We demonstrate the applicability of PRCU by applying it to two RCU-based concurrent algorithms---the Citrus binary search tree and a resizable hash table---and show experimentally that PRCU significantly improves the performance of both algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Golan-Gueta:2015:ASA, author = "Guy Golan-Gueta and G. Ramalingam and Mooly Sagiv and Eran Yahav", title = "Automatic scalable atomicity via semantic locking", journal = j-SIGPLAN, volume = "50", number = "8", pages = "31--41", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688511", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we consider concurrent programs in which the shared state consists of instances of linearizable ADTs (abstract data types). We present an automated approach to concurrency control that addresses a common need: the need to atomically execute a code fragment, which may contain multiple ADT operations on multiple ADT instances. 
We present a synthesis algorithm that automatically enforces atomicity of given code fragments (in a client program) by inserting pessimistic synchronization that guarantees atomicity and deadlock-freedom (without using any rollback mechanism). Our algorithm takes a commutativity specification as an extra input. This specification indicates for every pair of ADT operations the conditions under which the operations commute. Our algorithm enables greater parallelism by permitting commuting operations to execute concurrently. We have implemented the synthesis algorithm in a Java compiler, and applied it to several Java programs. Our results show that our approach produces efficient and scalable synchronization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Benson:2015:FPP, author = "Austin R. Benson and Grey Ballard", title = "A framework for practical parallel fast matrix multiplication", journal = j-SIGPLAN, volume = "50", number = "8", pages = "42--53", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688513", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Matrix multiplication is a fundamental computation in many scientific disciplines. In this paper, we show that novel fast matrix multiplication algorithms can significantly outperform vendor implementations of the classical algorithm and Strassen's fast algorithm on modest problem sizes and shapes. Furthermore, we show that the best choice of fast algorithm depends not only on the size of the matrices but also the shape. We develop a code generation tool to automatically implement multiple sequential and shared-memory parallel variants of each fast algorithm, including our novel parallelization scheme. This allows us to rapidly benchmark over 20 fast algorithms on several problem sizes. Furthermore, we discuss a number of practical implementation issues for these algorithms on shared-memory machines that can direct further research on making fast algorithms practical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "AMD Core Math Library (ACML); Cray Scientific Library (LibSci); IBM Engineering and Scientific Subroutine Library (ESSL); Intel MKL; LINPACK benchmark; numerical instability of $O(N^p)$ algorithms with $p < 3$; Strassen matrix multiplication; Strassen--Winograd algorithm", remark = "PPoPP '15 conference proceedings.", } @Article{Acharya:2015:PNC, author = "Aravind Acharya and Uday Bondhugula", title = "{PLUTO+}: near-complete modeling of affine transformations for parallelism and locality", journal = j-SIGPLAN, volume = "50", number = "8", pages = "54--64", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688512", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Affine transformations have proven to be very powerful for loop restructuring due to their ability to model a very wide range of transformations. A single multi-dimensional affine function can represent a long and complex sequence of simpler transformations. 
Existing affine transformation frameworks like the Pluto algorithm, which include a cost function for modern multicore architectures where coarse-grained parallelism and locality are crucial, consider only a sub-space of transformations to avoid a combinatorial explosion in finding the transformations. The ensuing practical trade-offs lead to the exclusion of certain useful transformations, in particular, transformation compositions involving loop reversals and loop skewing by negative factors. In this paper, we propose an approach to address this limitation by modeling a much larger space of affine transformations in conjunction with the Pluto algorithm's cost function. We perform an experimental evaluation of both the effect on compilation time and the performance of the generated code. The evaluation shows that our new framework, Pluto+, provides no degradation in performance in any of the Polybench benchmarks. For Lattice Boltzmann Method (LBM) codes with periodic boundary conditions, it provides a mean speedup of 1.33x over Pluto. We also show that Pluto+ does not increase compile times significantly. Experimental results on Polybench show that Pluto+ increases overall polyhedral source-to-source optimization time only by 15\%. In cases where it improves execution time significantly, it increases polyhedral optimization time only by 2.04x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Ravishankar:2015:DMC, author = "Mahesh Ravishankar and Roshan Dathathri and Venmugil Elango and Louis-No{\"e}l Pouchet and J. Ramanujam and Atanas Rountev and P. Sadayappan", title = "Distributed memory code generation for mixed irregular\slash regular computations", journal = j-SIGPLAN, volume = "50", number = "8", pages = "65--75", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688515", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many applications feature a mix of irregular and regular computational structures. For example, codes using adaptive mesh refinement (AMR) typically use a collection of regular blocks, where the number of blocks and the relationship between blocks is irregular. The computational structure in such applications generally involves regular (affine) loop computations within some number of innermost loops, while outer loops exhibit irregularity due to data-dependent control flow and indirect array access patterns. Prior approaches to distributed memory parallelization do not handle such computations effectively. They either target loop nests that are completely affine using polyhedral frameworks, or treat all loops as irregular. Consequently, the generated distributed memory code contains artifacts that disrupt the regular nature of previously affine innermost loops of the computation. This hampers subsequent optimizations to improve on-node performance. We propose a code generation framework that can effectively transform such applications for execution on distributed memory systems. Our approach generates distributed memory code which preserves program properties that enable subsequent polyhedral optimizations. Simultaneously, it addresses a major memory bottleneck of prior techniques that limits the scalability of the generated code.
The effectiveness of the proposed framework is demonstrated on computations that are mixed regular/irregular, completely regular, and completely irregular.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Xiang:2015:SPH, author = "Lingxiang Xiang and Michael L. Scott", title = "Software partitioning of hardware transactions", journal = j-SIGPLAN, volume = "50", number = "8", pages = "76--86", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688506", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Best-effort hardware transactional memory (HTM) allows complex operations to execute atomically and in parallel, so long as hardware buffers do not overflow, and conflicts are not encountered with concurrent operations. We describe a programming technique and compiler support to reduce both overflow and conflict rates by partitioning common operations into read-mostly (planning) and write-mostly (completion) operations, which then execute separately. The completion operation remains transactional; planning can often occur in ordinary code. High-level (semantic) atomicity for the overall operation is ensured by passing an application-specific validator object between planning and completion. Transparent composition of partitioned operations is made possible through fully-automated compiler support, which migrates all planning operations out of the parent transaction while respecting all program data flow and dependences. For both micro- and macro-benchmarks, experiments on IBM z-Series and Intel Haswell machines demonstrate that partitioning can lead to dramatically lower abort rates and higher scalability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Baldassin:2015:PID, author = "Alexandro Baldassin and Edson Borin and Guido Araujo", title = "Performance implications of dynamic memory allocators on transactional memory systems", journal = j-SIGPLAN, volume = "50", number = "8", pages = "87--96", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688504", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Although dynamic memory management accounts for a significant part of the execution time on many modern software systems, its impact on the performance of transactional memory systems has been mostly overlooked. In order to shed some light into this subject, this paper conducts a thorough investigation of the interplay between memory allocators and software transactional memory (STM) systems. We show that allocators can interfere with the way memory addresses are mapped to versioned locks on state-of-the-art software transactional memory implementations. Moreover, we observed that key aspects of allocators such as false sharing avoidance, scalability, and locality have a drastic impact on the final performance. For instance, we have detected performance differences of up to 171\% in the STAMP applications when using distinct allocators. 
Moreover, we show that optimizations at the STM level (such as caching transactional objects) are not effective when a modern allocator is already in use. All in all, our study highlights the importance of reporting the allocator utilized in the performance evaluation of transactional memory systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Zhang:2015:LOS, author = "Minjia Zhang and Jipeng Huang and Man Cao and Michael D. Bond", title = "Low-overhead software transactional memory with progress guarantees and strong semantics", journal = j-SIGPLAN, volume = "50", number = "8", pages = "97--108", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software transactional memory offers an appealing alternative to locks by improving programmability, reliability, and scalability. However, existing STMs are impractical because they add high instrumentation costs and often provide weak progress guarantees and/or semantics. This paper introduces a novel STM called LarkTM that provides three significant features. (1) Its instrumentation adds low overhead except when accesses actually conflict, enabling low single-thread overhead and scaling well on low-contention workloads. (2) It uses eager concurrency control mechanisms, yet naturally supports flexible conflict resolution, enabling strong progress guarantees. (3) It naturally provides strong atomicity semantics at low cost. LarkTM's design works well for low-contention workloads, but adds significant overhead under higher contention, so we design an adaptive version of LarkTM that uses alternative concurrency control for high-contention objects. An implementation and evaluation in a Java virtual machine show that the basic and adaptive versions of LarkTM not only provide low single-thread overhead, but their multithreaded performance compares favorably with existing high-performance STMs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Chabbi:2015:BEP, author = "Milind Chabbi and Wim Lavrijsen and Wibe de Jong and Koushik Sen and John Mellor-Crummey and Costin Iancu", title = "Barrier elision for production parallel programs", journal = j-SIGPLAN, volume = "50", number = "8", pages = "109--119", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688502", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large scientific code bases are often composed of several layers of runtime libraries, implemented in multiple programming languages. In such situations, programmers often choose conservative synchronization patterns, leading to suboptimal performance. In this paper, we present context-sensitive dynamic optimizations that elide barriers that are redundant during program execution.
In our technique, we perform data race detection alongside the program to identify redundant barriers in their calling contexts; after an initial learning, we start eliding all future instances of barriers occurring in the same calling context. We present an automatic on-the-fly optimization and a multi-pass guided optimization. We apply our techniques to NWChem--a 6 million line computational chemistry code written in C/C++/Fortran that uses several runtime libraries such as Global Arrays, ComEx, DMAPP, and MPI. Our technique elides a surprisingly high fraction of barriers (as many as 63\%) in production runs. This redundancy elimination translates to application speedups as high as 14\% on 2048 cores. Our techniques also provided valuable insight about the application behavior, later used by NWChem developers. Overall, we demonstrate the value of holistic context-sensitive analyses that consider the domain science in conjunction with the associated runtime software stack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Thebault:2015:SEI, author = "Lo{\"\i}c Th{\'e}bault and Eric Petit and Quang Dinh", title = "Scalable and efficient implementation of {$3$D} unstructured meshes computation: a case study on matrix assembly", journal = j-SIGPLAN, volume = "50", number = "8", pages = "120--129", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688517", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Exposing massive parallelism on 3D unstructured meshes computation with efficient load balancing and minimal synchronizations is challenging. Current approaches relying on domain decomposition and mesh coloring struggle to scale with the increasing number of cores per nodes, especially with new many-core processors. In this paper, we propose an hybrid approach using domain decomposition to exploit distributed memory parallelism, Divide-and-Conquer, D{\&}C, to exploit shared memory parallelism and improve locality, and mesh coloring at core level to exploit vectors. It illustrates a new trade-off for many-cores between structuredness, memory locality, and vectorization. We evaluate our approach on the finite element matrix assembly of an industrial fluid dynamic code developed by Dassault Aviation. We compare our D{\&}C approach to domain decomposition and to mesh coloring. D{\&}C achieves a high parallel efficiency, a good data locality as well as an improved bandwidth usage. It competes on current nodes with the optimized pure MPI version with a minimum 10\% speed-up. D{\&}C shows an impressive 319x strong scaling on 512 cores (32 nodes) with only 2000 vertices per core. Finally, the Intel Xeon Phi version has a performance similar to 10 Intel E5-2665 Xeon Sandy Bridge cores and 95\% parallel efficiency on the 60 physical cores. Running on 4 Xeon Phi (240 cores), D{\&}C has 92\% efficiency on the physical cores and performance similar to 33 Intel E5-2665 Xeon Sandy Bridge cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Tallent:2015:DCS, author = "Nathan R. Tallent and Abhinav Vishnu and Hubertus {Van Dam} and Jeff Daily and Darren J. 
Kerbyson and Adolfy Hoisie", title = "Diagnosing the causes and severity of one-sided message contention", journal = j-SIGPLAN, volume = "50", number = "8", pages = "130--139", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688516", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Two trends suggest network contention for one-sided messages is poised to become a performance problem that concerns application developers: an increased interest in one-sided programming models and a rising ratio of hardware threads to network injection bandwidth. Often it is difficult to reason about when one-sided tasks decrease or increase network contention. We present effective and portable techniques for diagnosing the causes and severity of one-sided message contention. To detect that a message is affected by contention, we maintain statistics representing instantaneous network resource demand. Using lightweight measurement and modeling, we identify the portion of a message's latency that is due to contention and whether contention occurs at the initiator or target. We attribute these metrics to program statements in their full static and dynamic context. We characterize contention for an important computational chemistry benchmark on InfiniBand, Cray Aries, and IBM Blue Gene/Q interconnects. We pinpoint the sources of contention, estimate their severity, and show that when message delivery time deviates from an ideal model, there are other messages contending for the same network links. With a small change to the benchmark, we reduce contention by 50\% and improve total runtime by 20\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Chang:2015:PAG, author = "Yen-Jung Chang and Vijay K. Garg", title = "A parallel algorithm for global states enumeration in concurrent systems", journal = j-SIGPLAN, volume = "50", number = "8", pages = "140--149", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688520", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Verifying the correctness of the executions of a concurrent program is difficult because of its nondeterministic behavior. One of the verification methods is predicate detection, which predicts whether the user specified condition (predicate) could become true in any global states of the program. The method is predictive because it generates inferred execution paths from the observed execution path and then checks the predicate on the global states of inferred paths. One important part of predicate detection is global states enumeration, which generates the global states on inferred paths. Cooper and Marzullo gave the first enumeration algorithm based on a breadth first strategy (BFS). Later, many algorithms have been proposed to improve space and time complexity. None of them, however, takes parallelism into consideration. In this paper, we present the first parallel and online algorithm, named ParaMount, for global state enumeration. 
Our experimental results show that ParaMount speeds up the existing sequential algorithms by a factor of 6 with 8 threads. We have implemented an online predicate detector using ParaMount. For predicate detection, our detector based on ParaMount is 10 to 50 times faster than RV runtime (a verification tool that uses Cooper and Marzullo's BFS enumeration algorithm).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Cogumbreiro:2015:DDV, author = "Tiago Cogumbreiro and Raymond Hu and Francisco Martins and Nobuko Yoshida", title = "Dynamic deadlock verification for general barrier synchronisation", journal = j-SIGPLAN, volume = "50", number = "8", pages = "150--160", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688519", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Armus, a dynamic verification tool for deadlock detection and avoidance specialised in barrier synchronisation. Barriers are used to coordinate the execution of groups of tasks, and serve as a building block of parallel computing. Our tool verifies more barrier synchronisation patterns than current state-of-the-art. To improve the scalability of verification, we introduce a novel event-based representation of concurrency constraints, and a graph-based technique for deadlock analysis. The implementation is distributed and fault-tolerant, and can verify X10 and Java programs. To formalise the notion of barrier deadlock, we introduce a core language expressive enough to represent the three most widespread barrier synchronisation patterns: group, split-phase, and dynamic membership. We propose a graph analysis technique that selects from two alternative graph representations: the Wait-For Graph, that favours programs with more tasks than barriers; and the State Graph, optimised for programs with more barriers than tasks. We prove that finding a deadlock in either representation is equivalent, and that the verification algorithm is sound and complete with respect to the notion of deadlock in our core language. Armus is evaluated with three benchmark suites in local and distributed scenarios. The benchmarks show that graph analysis with automatic graph-representation selection can record a 7-fold execution increase versus the traditional fixed graph representation. 
The performance measurements for distributed deadlock detection between 64 processes show negligible overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{You:2015:VFO, author = "Yi-Ping You and Hen-Jung Wu and Yeh-Ning Tsai and Yen-Ting Chao", title = "{VirtCL}: a framework for {OpenCL} device abstraction and management", journal = j-SIGPLAN, volume = "50", number = "8", pages = "161--172", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688505", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "The interest in using multiple graphics processing units (GPUs) to accelerate applications has increased in recent years. However, the existing heterogeneous programming models (e.g., OpenCL) abstract details of GPU devices at the per-device level and require programmers to explicitly schedule their kernel tasks on a system equipped with multiple GPU devices. Unfortunately, multiple applications running on a multi-GPU system may compete for some of the GPU devices while leaving other GPU devices unused. Moreover, the distributed memory model defined in OpenCL, where each device has its own memory space, increases the complexity of managing the memory among multiple GPU devices. In this article we propose a framework (called VirtCL) that reduces the programming burden by acting as a layer between the programmer and the native OpenCL run-time system for abstracting multiple devices into a single virtual device and for scheduling computations and communications among the multiple devices. VirtCL comprises two main components: (1) a front-end library, which exposes primary OpenCL APIs and the virtual device, and (2) a back-end run-time system (called CLDaemon) for scheduling and dispatching kernel tasks based on a history-based scheduler. The front-end library forwards computation requests to the back-end CLDaemon, which then schedules and dispatches the requests. We also propose a history-based scheduler that is able to schedule kernel tasks in a contention- and communication-aware manner. Experiments demonstrated that the VirtCL framework introduced a small overhead (mean of 6\%) but outperformed the native OpenCL run-time system for most benchmarks in the Rodinia benchmark suite, which was due to the abstraction layer eliminating the time-consuming initialization of OpenCL contexts. We also evaluated different scheduling policies in VirtCL with a real-world application (clsurf) and various synthetic workload traces. The results indicated that the VirtCL framework provides scalability for multiple kernel tasks running on multi-GPU systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Ashari:2015:OML, author = "Arash Ashari and Shirish Tatikonda and Matthias Boehm and Berthold Reinwald and Keith Campbell and John Keenleyside and P. 
Sadayappan", title = "On optimizing machine learning workloads via kernel fusion", journal = j-SIGPLAN, volume = "50", number = "8", pages = "173--182", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688521", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Exploitation of parallel architectures has become critical to scalable machine learning (ML). Since a wide range of ML algorithms employ linear algebraic operators, GPUs with BLAS libraries are a natural choice for such an exploitation. Two approaches are commonly pursued: (i) developing specific GPU accelerated implementations of complete ML algorithms; and (ii) developing GPU kernels for primitive linear algebraic operators like matrix-vector multiplication, which are then used in developing ML algorithms. This paper extends the latter approach by developing fused kernels for a combination of primitive operators that are commonly found in popular ML algorithms. We identify the generic pattern of computation (alpha * X^T (v * (X * y)) + beta * z) and its various instantiations. We develop a fused kernel to optimize this computation on GPUs --- with specialized techniques to handle both sparse and dense matrices. This approach not only reduces the cost of data loads due to improved temporal locality but also enables other optimizations like coarsening and hierarchical aggregation of partial results. We also present an analytical model that considers input data characteristics and available GPU resources to estimate near-optimal settings for kernel launch parameters. The proposed approach provides speedups ranging from 2 to 67 for different instances of the generic pattern compared to launching multiple operator-level kernels using GPU accelerated libraries. We conclude by demonstrating the effectiveness of the approach in improving end-to-end performance on an entire ML algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Zhang:2015:NAG, author = "Kaiyuan Zhang and Rong Chen and Haibo Chen", title = "{NUMA}-aware graph-structured analytics", journal = j-SIGPLAN, volume = "50", number = "8", pages = "183--193", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688507", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graph-structured analytics has been widely adopted in a number of big data applications such as social computation, web-search and recommendation systems. Though much prior research focuses on scaling graph-analytics on distributed environments, the strong desire on performance per core, dollar and joule has generated considerable interests of processing large-scale graphs on a single server-class machine, which may have several terabytes of RAM and 80 or more cores. However, prior graph-analytics systems are largely neutral to NUMA characteristics and thus have suboptimal performance. This paper presents a detailed study of NUMA characteristics and their impact on the efficiency of graph-analytics. 
Our study uncovers two insights: (1) either random or interleaved allocation of graph data will significantly hamper data locality and parallelism; (2) sequential inter-node (i.e., remote) memory accesses have much higher bandwidth than both intra- and inter-node random ones. Based on these insights, this paper describes Polymer, a NUMA-aware graph-analytics system on multicore with two key design decisions. First, Polymer differentially allocates and places topology data, application-defined data and mutable runtime states of a graph system according to their access patterns to minimize remote accesses. Second, for some remaining random accesses, Polymer carefully converts random remote accesses into sequential remote accesses, by using lightweight replication of vertices across NUMA nodes. To improve load balance and vertex convergence, Polymer is further built with a hierarchical barrier to boost parallelism and locality, an edge-oriented balanced partitioning for skewed graphs, and adaptive data structures according to the proportion of active vertices. A detailed evaluation on an 80-core machine shows that Polymer often outperforms the state-of-the-art single-machine graph-analytics systems, including Ligra, X-Stream and Galois, for a set of popular real-world and synthetic graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Xie:2015:SAT, author = "Chenning Xie and Rong Chen and Haibing Guan and Binyu Zang and Haibo Chen", title = "{SYNC} or {ASYNC}: time to fuse for distributed graph-parallel computation", journal = j-SIGPLAN, volume = "50", number = "8", pages = "194--204", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688508", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large-scale graph-structured computation usually exhibits an iterative and convergence-oriented computing nature, where input data is computed iteratively until a convergence condition is reached. Such features have led to the development of two different computation modes for graph-structured programs, namely synchronous (Sync) and asynchronous (Async) modes. Unfortunately, there is currently no in-depth study on their execution properties and thus programmers have to manually choose a mode, either requiring a deep understanding of underlying graph engines, or suffering from suboptimal performance. This paper makes the first comprehensive characterization of the performance of the two modes on a set of typical graph-parallel applications. Our study shows that the performance of the two modes varies significantly with different graph algorithms, partitioning methods, execution stages, input graphs and cluster scales, and no single mode consistently outperforms the other. To this end, this paper proposes Hsync, a hybrid graph computation mode that adaptively switches a graph-parallel program between the two modes for optimal performance. Hsync constantly collects execution statistics on-the-fly and leverages a set of heuristics to predict future performance and determine when a mode switch could be profitable. We have built online sampling and offline profiling approaches combined with a set of heuristics to accurately predict future performance in the two modes.
A prototype called PowerSwitch has been built based on PowerGraph, a state-of-the-art distributed graph-parallel system, to support adaptive execution of graph algorithms. On a 48-node EC2-like cluster, PowerSwitch consistently outperforms the best of both modes, with a speedup ranging from 9\% to 73\% due to timely switch between two modes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Tang:2015:COW, author = "Yuan Tang and Ronghui You and Haibin Kan and Jesmin Jahan Tithi and Pramod Ganapathi and Rezaul A. Chowdhury", title = "Cache-oblivious wavefront: improving parallelism of recursive dynamic programming algorithms without losing cache-efficiency", journal = j-SIGPLAN, volume = "50", number = "8", pages = "205--214", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688514", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "State-of-the-art cache-oblivious parallel algorithms for dynamic programming (DP) problems usually guarantee asymptotically optimal cache performance without any tuning of cache parameters, but they often fail to exploit the theoretically best parallelism at the same time. While these algorithms achieve cache-optimality through the use of a recursive divide-and-conquer (DAC) strategy, scheduling tasks at the granularity of task dependency introduces artificial dependencies in addition to those arising from the defining recurrence equations. We removed the artificial dependencies by scheduling tasks ready for execution as soon as all their real dependency constraints are satisfied, while preserving the cache-optimality by inheriting the DAC strategy. We applied our approach to a set of widely known dynamic programming problems, such as Floyd-Warshall's All-Pairs Shortest Paths, Stencil, and LCS. Theoretical analyses show that our techniques improve the span of 2-way DAC-based Floyd-Warshall's algorithm on an $n$ node graph from $\Theta(n \log^2 n)$ to $\Theta(n)$, stencil computations on a $d$-dimensional hypercubic grid of width $w$ for $h$ time steps from $\Theta((d^2 h) w^{\log (d + 2) - 1})$ to $\Theta(h)$, and LCS on two sequences of length $n$ each from $\Theta(n^{\log_2 3})$ to $\Theta(n)$. In each case, the total work and cache complexity remain asymptotically optimal. Experimental measurements exhibit a $3$--$5$ times improvement in absolute running time, $10$--$20$ times improvement in burdened span by Cilkview, and approximately the same L1/L2 cache misses by PAPI.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Chabbi:2015:HPL, author = "Milind Chabbi and Michael Fagan and John Mellor-Crummey", title = "High performance locks for multi-level {NUMA} systems", journal = j-SIGPLAN, volume = "50", number = "8", pages = "215--226", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688503", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Efficient locking mechanisms are critically important for high performance computers.
On highly-threaded systems with a deep memory hierarchy, the throughput of traditional queueing locks, e.g., MCS locks, falls off due to NUMA effects. Two-level cohort locks perform better on NUMA systems, but fail to deliver top performance for deep NUMA hierarchies. In this paper, we describe a hierarchical variant of the MCS lock that adapts the principles of cohort locking for architectures with deep NUMA hierarchies. We describe analytical models for throughput and fairness of Cohort-MCS (C-MCS) and Hierarchical MCS (HMCS) locks that enable us to tailor these locks for high performance on any target platform without empirical tuning. Using these models, one can select parameters such that an HMCS lock will deliver better fairness than a C-MCS lock for a given throughput, or deliver better throughput for a given fairness. Our experiments show that, under high contention, a three-level HMCS lock delivers up to 7.6x higher lock throughput than a C-MCS lock on a 128-thread IBM Power 755 and a five-level HMCS lock delivers up to 72x higher lock throughput on a 4096-thread SGI UV 1000. On the K-means clustering code from the MineBench suite, a three-level HMCS lock reduces the running time by up to 55\% compared to the C-MCS lock on an IBM Power 755.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Majo:2015:LPC, author = "Zoltan Majo and Thomas R. Gross", title = "A library for portable and composable data locality optimizations for {NUMA} systems", journal = j-SIGPLAN, volume = "50", number = "8", pages = "227--238", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688509", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many recent multiprocessor systems are realized with a non-uniform memory architecture (NUMA) and accesses to remote memory locations take more time than local memory accesses. Optimizing NUMA memory system performance is difficult and costly for three principal reasons: (1) today's programming languages/libraries have no explicit support for NUMA systems, (2) NUMA optimizations are not~portable, and (3) optimizations are not~composable (i.e., they can become ineffective or worsen performance in environments that support composable parallel software). This paper presents TBB-NUMA, a parallel programming library based on Intel Threading Building Blocks (TBB) that supports portable and composable NUMA-aware programming. TBB-NUMA provides a model of task affinity that captures a programmer's insights on mapping tasks to resources. NUMA-awareness affects all layers of the library (i.e., resource management, task scheduling, and high-level parallel algorithm templates) and requires close coupling between all these layers.
Optimizations implemented with TBB-NUMA (for a set of standard benchmark programs) result in up to 44\% performance improvement over standard TBB, but more important, optimized programs are portable across different NUMA architectures and preserve data locality also when composed with other parallel computations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Amer:2015:MRC, author = "Abdelhalim Amer and Huiwei Lu and Yanjie Wei and Pavan Balaji and Satoshi Matsuoka", title = "{MPI+Threads}: runtime contention and remedies", journal = j-SIGPLAN, volume = "50", number = "8", pages = "239--248", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hybrid MPI+Threads programming has emerged as an alternative model to the ``MPI everywhere'' model to better handle the increasing core density in cluster nodes. While the MPI standard allows multithreaded concurrent communication, such flexibility comes with the cost of maintaining thread safety within the MPI implementation, typically implemented using critical sections. In contrast to previous works that studied the importance of critical-section granularity in MPI implementations, in this paper we investigate the implication of critical-section arbitration on communication performance. We first analyze the MPI runtime when multithreaded concurrent communication takes place on hierarchical memory systems. Our results indicate that the mutex-based approach that most MPI implementations use today can incur performance penalties due to unfair arbitration. We then present methods to mitigate these penalties with a first-come, first-served arbitration and a priority locking scheme that favors threads doing useful work. Through evaluations using several benchmarks and applications, we demonstrate up to 5-fold improvement in performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{McPherson:2015:FPL, author = "Andrew J. McPherson and Vijay Nagarajan and Susmit Sarkar and Marcelo Cintra", title = "Fence placement for legacy data-race-free programs via synchronization read detection", journal = j-SIGPLAN, volume = "50", number = "8", pages = "249--250", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688524", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Fence placement is required to ensure legacy parallel programs operate correctly on relaxed architectures. The challenge is to place as few fences as possible without compromising correctness. 
By identifying necessary conditions for a read to be an acquire, we improve upon the state of the art for legacy DRF programs by up to 2.64x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Piao:2015:JJF, author = "Xianglan Piao and Channoh Kim and Younghwan Oh and Huiying Li and Jincheon Kim and Hanjun Kim and Jae W. Lee", title = "{JAWS}: a {JavaScript} framework for adaptive {CPU--GPU} work sharing", journal = j-SIGPLAN, volume = "50", number = "8", pages = "251--252", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688525", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces jAWS, a JavaScript framework for adaptive work sharing between CPU and GPU for data-parallel workloads. Unlike conventional heterogeneous parallel programming environments for JavaScript, which use only one compute device when executing a single kernel, jAWS accelerates kernel execution by exploiting both devices to realize the full performance potential of heterogeneous multicores. jAWS employs an efficient work partitioning algorithm that finds an optimal work distribution between the two devices without requiring offline profiling. The jAWS runtime provides shared arrays for multiple parallel contexts, hence eliminating extra copy overhead for input and output data. Our preliminary evaluation with both CPU-friendly and GPU-friendly benchmarks demonstrates that jAWS provides good load balancing and efficient data communication between parallel contexts, to significantly outperform the best single-device execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Seo:2015:GGS, author = "Hyunseok Seo and Jinwook Kim and Min-Soo Kim", title = "{GStream}: a graph streaming processing method for large-scale graphs on {GPUs}", journal = j-SIGPLAN, volume = "50", number = "8", pages = "253--254", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688526", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Fast processing of graph algorithms for large-scale graphs is becoming increasingly important. Moreover, there have been many attempts to process graph applications by exploiting the massive amount of parallelism of GPUs. However, most of the existing methods fail to process large-scale graphs that do not fit in GPU device memory. We propose a fast and scalable parallel processing method GStream that fully exploits the computational power of GPUs for processing large-scale graphs (e.g., billions of vertices) very efficiently. It exploits the concept of nested-loop theta-join and multiple asynchronous GPU streams.
Extensive experimental results show that GStream consistently and significantly outperforms the state-of-the art method.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Al-Saber:2015:SSA, author = "Nabeel Al-Saber and Milind Kulkarni", title = "{SemCache++}: semantics-aware caching for efficient multi-{GPU} offloading", journal = j-SIGPLAN, volume = "50", number = "8", pages = "255--256", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688527", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Offloading computations to multiple GPUs is not an easy task. It requires decomposing data, distributing computations and handling communication manually. Drop-in GPU libraries have made it easy to offload computations to multiple GPUs by hiding this complexity inside library calls. Such encapsulation prevents the reuse of the data between successive kernel invocations resulting in redundant communication. This limitation exists in multi-GPU libraries like CUBLASXT. In this paper, we introduce SemCache++, a semantics-aware GPU cache that automatically manages communication between the CPU and multiple GPUs in addition to optimizing communication by eliminating redundant transfers using caching. SemCache++ is used to build the first multi-GPU drop-in replacement library that (a) uses the virtual memory to automatically manage and optimize multi-GPU communication and (b) requires no program rewriting or annotations. Our caching technique is efficient; it uses a two level caching directory to track matrices and sub-matrices. Experimental results show that our system can eliminate redundant communication and deliver significant performance improvements over multi-GPU libraries like CUBLASXT.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Kim:2015:OBU, author = "Jungwon Kim and Seyong Lee and Jeffrey S. Vetter", title = "An {OpenACC}-based unified programming model for multi-accelerator systems", journal = j-SIGPLAN, volume = "50", number = "8", pages = "257--258", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688531", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper proposes a novel SPMD programming model of OpenACC. Our model integrates the different granularities of parallelism from vector-level parallelism to node-level parallelism into a single, unified model based on OpenACC. It allows programmers to write programs for multiple accelerators using a uniform programming model whether they are in shared or distributed memory systems. We implement a prototype of our model and evaluate its performance with a GPU-based supercomputer using three benchmark applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Thomson:2015:LHB, author = "Paul Thomson and Alastair F. 
Donaldson", title = "The lazy happens-before relation: better partial-order reduction for systematic concurrency testing", journal = j-SIGPLAN, volume = "50", number = "8", pages = "259--260", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688533", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the lazy happens-before relation (lazy HBR), which ignores mutex-induced edges to provide a more precise notion of state equivalence compared with the traditional happens-before relation. We demonstrate experimentally that the lazy HBR has the potential to provide greater schedule reduction during systematic concurrency testing with respect to a set of 79 Java benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Haidar:2015:TBL, author = "Azzam Haidar and Tingxing Dong and Piotr Luszczek and Stanimire Tomov and Jack Dongarra", title = "Towards batched linear solvers on accelerated hardware platforms", journal = j-SIGPLAN, volume = "50", number = "8", pages = "261--262", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As hardware evolves, an increasingly effective approach to developing energy-efficient, high-performance solvers is to design them to work on many small and independent problems. Indeed, many applications already need this functionality, especially for GPUs, which are known to be currently about four to five times more energy efficient than multicore CPUs for every floating-point operation. In this paper, we describe the development of the main one-sided factorizations (LU, QR, and Cholesky) that are needed for a set of small dense matrices to work in parallel. We refer to such algorithms as batched factorizations. Our approach is based on representing the algorithms as a sequence of batched BLAS routines for GPU-contained execution. Note that this is similar in functionality to the LAPACK and the hybrid MAGMA algorithms for large-matrix factorizations. But it is different from a straightforward approach, whereby each of the GPU's symmetric multiprocessors factorizes a single problem at a time. We illustrate how our performance analysis together with the profiling and tracing tools guided the development of batched factorizations to achieve up to 2-fold speedup and 3-fold better energy efficiency compared to our highly optimized batched CPU implementations based on the MKL library on a two-socket Intel Sandy Bridge server.
Compared to a batched LU factorization featured in NVIDIA's CUBLAS library for GPUs, we achieve up to a 2.5-fold speedup on the K40 GPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Muralidharan:2015:COP, author = "Saurav Muralidharan and Michael Garland and Bryan Catanzaro and Albert Sidelnik and Mary Hall", title = "A collection-oriented programming model for performance portability", journal = j-SIGPLAN, volume = "50", number = "8", pages = "263--264", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688537", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes Surge, a collection-oriented programming model that enables programmers to compose parallel computations using nested high-level data collections and operators. Surge exposes a code generation interface, decoupled from the core computation, that enables programmers and autotuners to easily generate multiple implementations of the same computation on various parallel architectures such as multi-core CPUs and GPUs. By decoupling computations from architecture-specific implementation, programmers can target multiple architectures more easily, and generate a search space that facilitates optimization and customization for specific architectures. We express four real-world benchmarks from domains such as sparse linear algebra and machine learning in Surge and, from the same performance-portable specification, generate OpenMP and CUDA C++ implementations. Surge generates efficient, scalable code that achieves up to a 1.32x speedup over handcrafted, well-optimized CUDA code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Wang:2015:GHP, author = "Yangzihao Wang and Andrew Davidson and Yuechao Pan and Yuduo Wu and Andy Riffel and John D. Owens", title = "{Gunrock}: a high-performance graph processing library on the {GPU}", journal = j-SIGPLAN, volume = "50", number = "8", pages = "265--266", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688538", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For large-scale graph analytics on the GPU, the irregularity of data access and control flow and the complexity of programming GPUs have been two significant challenges for developing a programmable high-performance graph library. ``Gunrock'', our graph-processing system, uses a high-level bulk-synchronous abstraction with traversal and computation steps, designed specifically for the GPU. Gunrock couples high performance with a high-level programming model that allows programmers to quickly develop new graph primitives with less than 300 lines of code.
We evaluate Gunrock on five graph primitives and show that Gunrock has at least an order of magnitude speedup over Boost and PowerGraph, comparable performance to the fastest GPU hardwired primitives, and better performance than any other GPU high-level graph library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Pearce:2015:DLB, author = "Olga Pearce and Todd Gamblin and Bronis R. de Supinski and Martin Schulz and Nancy M. Amato", title = "Decoupled load balancing", journal = j-SIGPLAN, volume = "50", number = "8", pages = "267--268", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688539", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern scientific simulations divide work between parallel processors by decomposing a spatial domain of mesh cells, particles, or other elements. A balanced assignment of the computational load is critical for parallel performance. If the computation per element changes over the simulation time, simulations can use dynamic load balance algorithms to evenly redistribute work to processes. Graph partitioners are widely used and balance very effectively, but they do not strong scale well. Typical SPMD simulations wait while a load balance algorithm runs on all processors, so a poorly scaling algorithm can itself become a bottleneck. We observe that the load balance algorithm is separate from the main application computation and has its own scaling properties. We propose to decouple the load balance algorithm from the application, and to offload the load balance computation so that it runs concurrently with the application on a smaller number of processors. We demonstrate the costs of decoupling and offloading the load balancing algorithm from a Barnes--Hut application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Jin:2015:CPI, author = "Ye Jin and Mingliang Liu and Xiaosong Ma and Qing Liu and Jeremy Logan and Norbert Podhorszki and Jong Youl Choi and Scott Klasky", title = "Combining phase identification and statistic modeling for automated parallel benchmark generation", journal = j-SIGPLAN, volume = "50", number = "8", pages = "269--270", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688541", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parallel application benchmarks are indispensable for evaluating/optimizing HPC software and hardware. However, it is very challenging and costly to obtain high-fidelity benchmarks reflecting the scale and complexity of state-of-the-art parallel applications. Hand-extracted synthetic benchmarks are time- and labor-intensive to create. Real applications themselves, while offering most accurate performance evaluation, are expensive to compile, port, reconfigure, and often plainly inaccessible due to security or ownership concerns. 
This work contributes APPRIME, a novel tool for trace-based automatic parallel benchmark generation. Taking as input standard communication-I/O traces of an application's execution, it couples accurate automatic phase identification with statistical regeneration of event parameters to create compact, portable, and to some degree reconfigurable parallel application benchmarks. Experiments with four NAS Parallel Benchmarks (NPB) and three real scientific simulation codes confirm the fidelity of APPRIME benchmarks. They retain the original applications' performance characteristics, in particular the relative performance across platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Shi:2015:OAG, author = "Xuanhua Shi and Junling Liang and Sheng Di and Bingsheng He and Hai Jin and Lu Lu and Zhixiang Wang and Xuan Luo and Jianlong Zhong", title = "Optimization of asynchronous graph processing on {GPU} with hybrid coloring model", journal = j-SIGPLAN, volume = "50", number = "8", pages = "271--272", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688542", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern GPUs have been widely used to accelerate the graph processing for complicated computational problems regarding graph theory. Many parallel graph algorithms adopt the asynchronous computing model to accelerate the iterative convergence. Unfortunately, the consistent asynchronous computing requires locking or the atomic operations, leading to significant penalties/overheads when implemented on GPUs. To this end, coloring algorithm is adopted to separate the vertices with potential updating conflicts, guaranteeing the consistency/correctness of the parallel processing. We propose a light-weight asynchronous processing framework called Frog with a hybrid coloring model. We find that majority of vertices (about 80\%) are colored with only a few colors, such that they can be read and updated in a very high degree of parallelism without violating the sequential consistency. Accordingly, our solution will separate the processing of the vertices based on the distribution of colors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{West:2015:ERO, author = "Scott West and Sebastian Nanz and Bertrand Meyer", title = "Efficient and reasonable object-oriented concurrency", journal = j-SIGPLAN, volume = "50", number = "8", pages = "273--274", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688545", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Making threaded programs safe and easy to reason about is one of the chief difficulties in modern programming. This work provides an efficient execution model and implementation for SCOOP, a concurrency approach that provides not only data-race freedom but also pre/postcondition reasoning guarantees between threads. 
The extensions we propose influence the underlying semantics to increase the amount of concurrent execution that is possible, exclude certain classes of deadlocks, and enable greater performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Vassiliadis:2015:PMR, author = "Vassilis Vassiliadis and Konstantinos Parasyris and Charalambos Chalios and Christos D. Antonopoulos and Spyros Lalis and Nikolaos Bellas and Hans Vandierendonck and Dimitrios S. Nikolopoulos", title = "A programming model and runtime system for significance-aware energy-efficient computing", journal = j-SIGPLAN, volume = "50", number = "8", pages = "275--276", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688546", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a task-based programming model and runtime system that exploit the observation that not all parts of a program are equally significant for the accuracy of the end-result, in order to trade off the quality of program outputs for increased energy-efficiency. This is done in a structured and flexible way, allowing for easy exploitation of different points in the quality/energy space, without adversely affecting application performance. The runtime system can apply a number of different policies to decide whether it will execute less-significant tasks accurately or approximately. The experimental evaluation indicates that our system can achieve an energy reduction of up to 83\% compared with a fully accurate execution and up to 35\% compared with an approximate version employing loop perforation. At the same time, our approach always results in graceful quality degradation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Wimmer:2015:LFK, author = "Martin Wimmer and Jakob Gruber and Jesper Larsson Tr{\"a}ff and Philippas Tsigas", title = "The lock-free {$k$-LSM} relaxed priority queue", journal = j-SIGPLAN, volume = "50", number = "8", pages = "277--278", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688547", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new, concurrent, lock-free priority queue that relaxes the delete-min operation to allow deletion of any of the $\rho$ smallest keys instead of only a minimal one, where $\rho$ is a parameter that can be configured at runtime. It is built from a logarithmic number of sorted arrays, similar to log-structured merge-trees (LSM). For keys added and removed by the same thread, the behavior is identical to a non-relaxed priority queue.
We compare to state-of-the-art lock-free priority queues with both relaxed and non-relaxed semantics, showing high performance and good scalability of our approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Saillard:2015:SDV, author = "Emmanuelle Saillard and Patrick Carribault and Denis Barthou", title = "Static\slash dynamic validation of {MPI} collective communications in multi-threaded context", journal = j-SIGPLAN, volume = "50", number = "8", pages = "279--280", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688548", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scientific applications mainly rely on the MPI parallel programming model to reach high performance on supercomputers. The advent of manycore architectures (larger number of cores and lower amount of memory per core) leads to mixing MPI with a thread-based model like OpenMP. But integrating two different programming models inside the same application can be tricky and generate complex bugs. Thus, the correctness of hybrid programs requires special care regarding the location of MPI calls. For example, identical MPI collective operations cannot be performed by multiple non-synchronized threads. To tackle this issue, this paper proposes a static analysis and a reduced dynamic instrumentation to detect bugs related to misuse of MPI collective operations inside or outside threaded regions. This work extends PARCOACH, designed for MPI-only applications, and keeps compatibility with its algorithms. We validated our method on multiple hybrid benchmarks and applications with low overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Ramachandran:2015:CFC, author = "Arunmoezhi Ramachandran and Neeraj Mittal", title = "{CASTLE}: fast concurrent internal binary search tree using edge-based locking", journal = j-SIGPLAN, volume = "50", number = "8", pages = "281--282", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688551", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new lock-based algorithm for concurrent manipulation of a binary search tree in an asynchronous shared memory system that supports search, insert and delete operations. Some of the desirable characteristics of our algorithm are: (i) a search operation uses only read and write instructions, (ii) an insert operation does not acquire any locks, and (iii) a delete operation only needs to lock up to four edges in the absence of contention. Our algorithm is based on an internal representation of a search tree and it operates at edge-level (locks edges) rather than at node-level (locks nodes); this minimizes the contention window of a write operation and improves the system throughput.
Our experiments indicate that our lock-based algorithm outperforms existing algorithms for a concurrent binary search tree for medium-sized and larger trees, achieving up to 59\% higher throughput than the next best algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Das:2015:SBP, author = "Madan Das and Gabriel Southern and Jose Renau", title = "Section based program analysis to reduce overhead of detecting unsynchronized thread communication", journal = j-SIGPLAN, volume = "50", number = "8", pages = "283--284", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688552", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose Section Based Program Analysis (SBPA), a novel way to decompose programs into disjoint sections to identify non-communicating loads and stores during program compilation. We implemented SBPA for a deterministic execution runtime environment and reduced 63\% of dynamic memory access instrumentations. We also integrated SBPA with ThreadSanitizer, and achieved a speed-up of 2.74 on a geometric mean basis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Harshvardhan:2015:HAR, author = "Harshvardhan and Nancy M. Amato and Lawrence Rauchwerger", title = "A hierarchical approach to reducing communication in parallel graph algorithms", journal = j-SIGPLAN, volume = "50", number = "8", pages = "285--286", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2700994", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large-scale graph computing has become critical due to the ever-increasing size of data. However, distributed graph computations are limited in their scalability and performance due to the heavy communication inherent in such computations. This is exacerbated in scale-free networks, such as social and web graphs, which contain hub vertices that have large degrees and therefore send a large number of messages over the network. Furthermore, many graph algorithms and computations send the same data to each of the neighbors of a vertex. Our proposed approach recognizes this, and reduces communication performed by the algorithm without change to user-code, through a hierarchical machine model imposed upon the input graph. The hierarchical model takes advantage of locale information of the neighboring vertices to reduce communication, both in message volume and total number of bytes sent. It is also able to better exploit the machine hierarchy to further reduce the communication costs, by aggregating traffic between different levels of the machine hierarchy. 
Results of an implementation in the STAPL GL show improved scalability and performance over the traditional level-synchronous approach, with a 2.5$\times$--8$\times$ improvement for a variety of graph algorithms at 12,000+ cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Chen:2015:TNL, author = "Yifeng Chen and Xiang Cui and Hong Mei", title = "{Tiles}: a new language mechanism for heterogeneous parallelism", journal = j-SIGPLAN, volume = "50", number = "8", pages = "287--288", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688555", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper studies the essence of heterogeneity from the perspective of language mechanism design. The proposed mechanism, called tiles, is a program construct that bridges two relative levels of computation: an outer level of source data in larger, slower or more distributed memory and an inner level of data blocks in smaller, faster or more localized memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Radoi:2015:WAR, author = "Cosmin Radoi and Stephan Herhut and Jaswanth Sreeram and Danny Dig", title = "Are web applications ready for parallelism?", journal = j-SIGPLAN, volume = "50", number = "8", pages = "289--290", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2700995", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In recent years, web applications have become pervasive. Their backbone is JavaScript, the only programming language supported by all major web browsers. Most browsers run on desktop or mobile devices with parallel hardware. However, JavaScript is by design sequential, and current web applications make little use of hardware parallelism. Are web applications ready to exploit parallel hardware? We answer the question in two steps: First, we survey 174 web developers about the potential and challenges of using parallelism. Then, we study the performance and computation shape of a set of web applications that are representative of the emerging web.
Our findings indicate that emerging web applications do have latent data parallelism, and JavaScript developers' programming style is not a significant impediment to exploiting this parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '15 conference proceedings.", } @Article{Bodik:2015:PSO, author = "Rastislav Bodik", title = "Program synthesis: opportunities for the next decade", journal = j-SIGPLAN, volume = "50", number = "9", pages = "1--1", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2789052", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Program synthesis is the contemporary answer to automatic programming. It innovates in two ways: First, it replaces batch automation with interactivity, assisting the programmer in refining the understanding of the programming problem. Second, it produces programs using search in a candidate space rather than by derivation from a specification. Searching for an acceptable program means that we can accommodate incomplete specifications, such as examples. Additionally, search makes synthesis applicable to domains that lack correct-by-construction derivation rules, such as hardware design, education, end-user programming, and systems biology. The future of synthesis rests on four challenges, each presenting an opportunity to develop novel abstractions for ``programming with search.'' Larger scope: today, we synthesize small, flat programs; synthesis of large software will need constructs for modularity and stepwise refinement. New interaction modes: to solicit the specification without simply asking for more examples, we need to impose a structure on the candidate space and explore it in a dialogue. Construction: how to compile a synthesis problem to a search algorithm without building a compiler? Everything is a program: whatever can be phrased as a program can be in principle synthesized. Indeed, we will see synthesis advance from synthesis of plain programs to synthesis of compilers and languages. The latter may include DSLs, type systems, and modeling languages for biology. As such, synthesis could help mechanize the crown jewel of programming languages research --- the design of abstractions --- which has so far been done manually and only by experts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Rompf:2015:FPS, author = "Tiark Rompf and Nada Amin", title = "Functional pearl: a {SQL} to {C} compiler in 500 lines of code", journal = j-SIGPLAN, volume = "50", number = "9", pages = "2--9", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784760", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the design and implementation of a SQL query processor that outperforms existing database systems and is written in just about 500 lines of Scala code --- a convincing case study that high-level functional programming can handily beat C for systems-level programming where the last drop of performance matters. 
The key enabler is a shift in perspective towards generative programming. The core of the query engine is an interpreter for relational algebra operations, written in Scala. Using the open-source LMS Framework (Lightweight Modular Staging), we turn this interpreter into a query compiler with very low effort. To do so, we capitalize on an old and widely known result from partial evaluation known as Futamura projections, which state that a program that can specialize an interpreter to any given input program is equivalent to a compiler. In this pearl, we discuss LMS programming patterns such as mixed-stage data structures (e.g. data records with static schema and dynamic field components) and techniques to generate low-level C code, including specialized data structures and data loading primitives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Chlipala:2015:OCP, author = "Adam Chlipala", title = "An optimizing compiler for a purely functional web-application language", journal = j-SIGPLAN, volume = "50", number = "9", pages = "10--21", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784741", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "High-level scripting languages have become tremendously popular for development of dynamic Web applications. Many programmers appreciate the productivity benefits of automatic storage management, freedom from verbose type annotations, and so on. While it is often possible to improve performance substantially by rewriting an application in C or a similar language, very few programmers bother to do so, because of the consequences for human development effort. This paper describes a compiler that makes it possible to have most of the best of both worlds, coding Web applications in a high-level language but compiling to native code with performance comparable to handwritten C code. The source language is Ur/Web, a domain-specific, purely functional, statically typed language for the Web. Through a coordinated suite of relatively straightforward program analyses and algebraic optimizations, we transform Ur/Web programs into almost-idiomatic C code, with no garbage collection, little unnecessary memory allocation for intermediate values, etc. Our compiler is in production use for commercial Web sites supporting thousands of users, and microbenchmarks demonstrate very competitive performance versus mainstream tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Bauman:2015:PTJ, author = "Spenser Bauman and Carl Friedrich Bolz and Robert Hirschfeld and Vasily Kirilichev and Tobias Pape and Jeremy G. 
Siek and Sam Tobin-Hochstadt", title = "{Pycket}: a tracing {JIT} for a functional language", journal = j-SIGPLAN, volume = "50", number = "9", pages = "22--34", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784740", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Pycket, a high-performance tracing JIT compiler for Racket. Pycket supports a wide variety of the sophisticated features in Racket such as contracts, continuations, classes, structures, dynamic binding, and more. On average, over a standard suite of benchmarks, Pycket outperforms existing compilers, both Racket's JIT and other highly-optimizing Scheme compilers. Further, Pycket provides much better performance for Racket proxies than existing systems, dramatically reducing the overhead of contracts and gradual typing. We validate this claim with performance evaluation on multiple existing benchmark suites. The Pycket implementation is of independent interest as an application of the RPython meta-tracing framework (originally created for PyPy), which automatically generates tracing JIT compilers from interpreters. Prior work on meta-tracing focuses on bytecode interpreters, whereas Pycket is a high-level interpreter based on the CEK abstract machine and operates directly on abstract syntax trees. Pycket supports proper tail calls and first-class continuations. In the setting of a functional language, where recursion and higher-order functions are more prevalent than explicit loops, the most significant performance challenge for a tracing JIT is identifying which control flows constitute a loop---we discuss two strategies for identifying loops and measure their impact.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Rossberg:2015:CMU, author = "Andreas Rossberg", title = "{1ML} --- core and modules united ({$F$}-ing first-class modules)", journal = j-SIGPLAN, volume = "50", number = "9", pages = "35--47", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784738", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "ML is two languages in one: there is the core, with types and expressions, and there are modules, with signatures, structures and functors. Modules form a separate, higher-order functional language on top of the core. There are both practical and technical reasons for this stratification; yet, it creates substantial duplication in syntax and semantics, and it reduces expressiveness. For example, selecting a module cannot be made a dynamic decision. Language extensions allowing modules to be packaged up as first-class values have been proposed and implemented in different variations. However, they remedy expressiveness only to some extent, are syntactically cumbersome, and do not alleviate redundancy. We propose a redesign of ML in which modules are truly first-class values, and core and module layer are unified into one language. 
In this ``1ML'', functions, functors, and even type constructors are one and the same construct; likewise, no distinction is made between structures, records, or tuples. Or viewed the other way round, everything is just (``a mode of use of'') modules. Yet, 1ML does not require dependent types, and its type structure is expressible in terms of plain System F$_\omega$, in a minor variation of our F-ing modules approach. We introduce both an explicitly typed version of 1ML, and an extension with Damas/Milner-style implicit quantification. Type inference for this language is not complete, but, we argue, not substantially worse than for Standard ML. An alternative view is that 1ML is a user-friendly surface syntax for System F$_\omega$ that allows combining term and type abstraction in a more compositional manner than the bare calculus.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Vazou:2015:BRT, author = "Niki Vazou and Alexander Bakst and Ranjit Jhala", title = "Bounded refinement types", journal = j-SIGPLAN, volume = "50", number = "9", pages = "48--61", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784745", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a notion of bounded quantification for refinement types and show how it expands the expressiveness of refinement typing by using it to develop typed combinators for: (1) relational algebra and safe database access, (2) Floyd-Hoare logic within a state transformer monad equipped with combinators for branching and looping, and (3) using the above to implement a refined IO monad that tracks capabilities and resource usage. This leap in expressiveness comes via a translation to ``ghost'' functions, which lets us retain the automated and decidable SMT based checking and inference that makes refinement typing effective in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Matsuda:2015:ABP, author = "Kazutaka Matsuda and Meng Wang", title = "Applicative bidirectional programming with lenses", journal = j-SIGPLAN, volume = "50", number = "9", pages = "62--74", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784750", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A bidirectional transformation is a pair of mappings between source and view data objects, one in each direction. When the view is modified, the source is updated accordingly with respect to some laws. One way to reduce the development and maintenance effort of bidirectional transformations is to have specialized languages in which the resulting programs are bidirectional by construction---giving rise to the paradigm of bidirectional programming.
In this paper, we develop a framework for applicative-style and higher-order bidirectional programming, in which we can write bidirectional transformations as unidirectional programs in standard functional languages, opening up access to the bundle of language features previously only available to conventional unidirectional languages. Our framework essentially bridges two very different approaches of bidirectional programming, namely the lens framework and Voigtl{\"a}nder's semantic bidirectionalization, creating a new programming style that is able to bag benefits from both.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Pombrio:2015:HRC, author = "Justin Pombrio and Shriram Krishnamurthi", title = "Hygienic resugaring of compositional desugaring", journal = j-SIGPLAN, volume = "50", number = "9", pages = "75--87", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784755", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Syntactic sugar is widely used in language implementation. Its benefits are, however, offset by the comprehension problems it presents to programmers once their program has been transformed. In particular, after a transformed program has begun to evaluate (or otherwise be altered by a black-box process), it can become unrecognizable. We present a new approach to ``resugaring'' programs, which is the act of reflecting evaluation steps in the core language in terms of the syntactic sugar that the programmer used. Relative to prior work, our approach has two important advances: it handles hygiene, and it allows almost arbitrary rewriting rules (as opposed to restricted patterns). We do this in the context of a DAG representation of programs, rather than more traditional trees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Geneves:2015:XST, author = "Pierre Genev{\`e}s and Nils Gesbert", title = "{XQuery} and static typing: tackling the problem of backward axes", journal = j-SIGPLAN, volume = "50", number = "9", pages = "88--100", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784746", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "XQuery is a functional language dedicated to XML data querying and manipulation. As opposed to other W3C-standardized languages for XML (e.g. XSLT), it has been intended to feature strong static typing. Currently, however, some expressions of the language cannot be statically typed with any precision. We argue that this is due to a discrepancy between the semantics of the language and its type algebra: namely, the values of the language are (possibly inner) tree nodes, which may have siblings and ancestors in the data. The types on the other hand are regular tree types, as usual in the XML world: they describe sets of trees. The type associated to a node then corresponds to the subtree whose root is that node and contains no information about the rest of the data.
This makes navigation expressions using `backward axes,' which return e.g. the siblings of a node, impossible to type. We discuss how to handle this discrepancy by improving the type system. We describe a logic-based language of extended types able to represent inner tree nodes and show how it can dramatically increase the precision of typing for navigation expressions. We describe how inclusion between these extended types and the classical regular tree types can be decided, allowing a hybrid system combining both type languages. The result is a net increase in precision of typing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Bowman:2015:NF, author = "William J. Bowman and Amal Ahmed", title = "Noninterference for free", journal = j-SIGPLAN, volume = "50", number = "9", pages = "101--113", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784733", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The dependency core calculus (DCC) is a framework for studying a variety of dependency analyses (e.g., secure information flow). The key property provided by DCC is noninterference, which guarantees that a low-level observer (attacker) cannot distinguish high-level (protected) computations. The proof of noninterference for DCC suggests a connection to parametricity in System F, which suggests that it should be possible to implement dependency analyses in languages with parametric polymorphism. We present a translation from DCC into F$_\omega$ and prove that the translation preserves noninterference. To express noninterference in F$_\omega$, we define a notion of observer-sensitive equivalence that makes essential use of both first-order and higher-order polymorphism. Our translation provides insights into DCC's type system and shows how DCC can be implemented in a polymorphic language without loss of the noninterference (security) guarantees available in DCC. Our contributions include proof techniques that should be valuable when proving other secure compilation or full abstraction results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Gaboardi:2015:ACL, author = "Marco Gaboardi and Romain P{\'e}choux", title = "Algebras and coalgebras in the light affine {Lambda} calculus", journal = j-SIGPLAN, volume = "50", number = "9", pages = "114--126", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784759", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Algebra and coalgebra are widely used to model data types in functional programming languages and proof assistants. Their use permits to better structure the computations and also to enhance the expressivity of a language or of a proof system. Interestingly, parametric polymorphism {\`a} la System F provides a way to encode algebras and coalgebras in strongly normalizing languages without losing the good logical properties of the calculus.
Even if these encodings are sometimes unsatisfying because they provide only limited forms of algebras and coalgebras, they give insights on the expressivity of System F in terms of functions that we can program in it. With the goal of contributing to a better understanding of the expressivity of Implicit Computational Complexity systems, we study the problem of defining algebras and coalgebras in the Light Affine Lambda Calculus, a system characterizing the complexity class FPTIME. This system limits the computational complexity of programs but it also limits the ways we can use parametric polymorphism, and in general the way we can write our programs. We show here that while the restrictions imposed by the Light Affine Lambda Calculus pose some issues to the standard System F encodings, they still permit to encode some form of algebra and coalgebra. Using the algebra encoding one can define in the Light Affine Lambda Calculus the traditional inductive types. Unfortunately, the corresponding coalgebra encoding permits only a very limited form of coinductive data types. To extend this class we study an extension of the Light Affine Lambda Calculus by distributive laws for the modality \S . This extension has been discussed but not studied before.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Downen:2015:SSR, author = "Paul Downen and Philip Johnson-Freyd and Zena M. Ariola", title = "Structures for structural recursion", journal = j-SIGPLAN, volume = "50", number = "9", pages = "127--139", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784762", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Our goal is to develop co-induction from our understanding of induction, putting them on level ground as equal partners for reasoning about programs. We investigate several structures which represent well-founded forms of recursion in programs. These simple structures encapsulate reasoning by primitive and noetherian induction principles, and can be composed together to form complex recursion schemes for programs operating over a wide class of data and co-data types. At its heart, this study is guided by duality: each structure for recursion has a dual form, giving perfectly symmetric pairs of equal and opposite data and co-data types for representing recursion in programs. Duality is brought out through a framework presented in sequent style, which inherently includes control effects that are interpreted logically as classical reasoning principles. To accommodate the presence of effects, we give a calculus parameterized by a notion of strategy, which is strongly normalizing for a wide range of strategies. We also present a more traditional calculus for representing effect-free functional programs, but at the cost of losing some of the founding dualities.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Danner:2015:DCS, author = "Norman Danner and Daniel R. 
Licata and Ramyaa Ramyaa", title = "Denotational cost semantics for functional languages with inductive types", journal = j-SIGPLAN, volume = "50", number = "9", pages = "140--151", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784749", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A central method for analyzing the asymptotic complexity of a functional program is to extract and then solve a recurrence that expresses evaluation cost in terms of input size. The relevant notion of input size is often specific to a datatype, with measures including the length of a list, the maximum element in a list, and the height of a tree. In this work, we give a formal account of the extraction of cost and size recurrences from higher-order functional programs over inductive datatypes. Our approach allows a wide range of programmer-specified notions of size, and ensures that the extracted recurrences correctly predict evaluation cost. To extract a recurrence from a program, we first make costs explicit by applying a monadic translation from the source language to a complexity language, and then abstract datatype values as sizes. Size abstraction can be done semantically, working in models of the complexity language, or syntactically, by adding rules to a preorder judgement. We give several different models of the complexity language, which support different notions of size. Additionally, we prove by a logical relations argument that recurrences extracted by this process are upper bounds for evaluation cost; the proof is entirely syntactic and therefore applies to all of the models we consider.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Avanzini:2015:ACF, author = "Martin Avanzini and Ugo {Dal Lago} and Georg Moser", title = "Analysing the complexity of functional programs: higher-order meets first-order", journal = j-SIGPLAN, volume = "50", number = "9", pages = "152--164", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784753", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We show how the complexity of higher-order functional programs can be analysed automatically by applying program transformations to defunctionalised versions of them, and feeding the result to existing tools for the complexity analysis of first-order term rewrite systems. This is done while carefully analysing complexity preservation and reflection of the employed transformations such that the complexity of the obtained term rewrite system reflects on the complexity of the initial program.
Further, we describe suitable strategies for the application of the studied transformations and provide ample experimental data for assessing the viability of our method.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Sheeran:2015:FPH, author = "Mary Sheeran", title = "Functional programming and hardware design: still interesting after all these years", journal = j-SIGPLAN, volume = "50", number = "9", pages = "165--165", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2789053", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Higher order functions provide an elegant way to express algorithms designed for implementation in hardware. By showing examples of both classic and new algorithms, I will explain why higher order functions deserve to be studied. Next, I will consider the extent to which ideas from functional programming, and associated formal verification methods, have influenced hardware design in practice. What can we learn from looking back? You might ask ``Why are methods of hardware design still important to our community?''. Maybe we should just give up? One reason for not giving up is that hardware design is really a form of parallel programming. And here there is still a lot to do! Inspired by Blelloch's wonderful invited talk at ICFP 2010, I still believe that functional programming has much to offer in the central question of how to program the parallel machines of today, and, more particularly, of the future. I will briefly present some of the areas where I think that we are poised to make great contributions. But maybe we need to work harder on getting our act together?", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Neis:2015:PCV, author = "Georg Neis and Chung-Kil Hur and Jan-Oliver Kaiser and Craig McLaughlin and Derek Dreyer and Viktor Vafeiadis", title = "{Pilsner}: a compositionally verified compiler for a higher-order imperative language", journal = j-SIGPLAN, volume = "50", number = "9", pages = "166--178", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784764", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compiler verification is essential for the construction of fully verified software, but most prior work (such as CompCert) has focused on verifying whole-program compilers. To support separate compilation and to enable linking of results from different verified compilers, it is important to develop a compositional notion of compiler correctness that is modular (preserved under linking), transitive (supports multi-pass compilation), and flexible (applicable to compilers that use different intermediate languages or employ non-standard program transformations). In this paper, building on prior work of Hur et al., we develop a novel approach to compositional compiler verification based on parametric inter-language simulations (PILS). 
PILS are modular: they enable compiler verification in a manner that supports separate compilation. PILS are transitive: we use them to verify Pilsner, a simple (but non-trivial) multi-pass optimizing compiler (programmed in Coq) from an ML-like source language S to an assembly-like target language T, going through a CPS-based intermediate language. Pilsner is the first multi-pass compiler for a higher-order imperative language to be compositionally verified. Lastly, PILS are flexible: we use them to additionally verify (1) Zwickel, a direct non-optimizing compiler for S, and (2) a hand-coded self-modifying T module, proven correct w.r.t. an S-level specification. The output of Zwickel and the self-modifying T module can then be safely linked together with the output of Pilsner. All together, this has been a significant undertaking, involving several person-years of work and over 55,000 lines of Coq.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Ziliani:2015:UAC, author = "Beta Ziliani and Matthieu Sozeau", title = "A unification algorithm for {Coq} featuring universe polymorphism and overloading", journal = j-SIGPLAN, volume = "50", number = "9", pages = "179--191", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784751", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Unification is a core component of every proof assistant or programming language featuring dependent types. In many cases, it must deal with higher-order problems up to conversion. Since unification in such conditions is undecidable, unification algorithms may include several heuristics to solve common problems. However, when the stack of heuristics grows large, the result and complexity of the algorithm can become unpredictable. Our contributions are twofold: (1) We present a full description of a new unification algorithm for the Calculus of Inductive Constructions (the base logic of Coq), including universe polymorphism, canonical structures (the overloading mechanism baked into Coq's unification), and a small set of useful heuristics. (2) We implemented our algorithm, and tested it on several libraries, providing evidence that the selected set of heuristics suffices for large developments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Blanchette:2015:FEC, author = "Jasmin Christian Blanchette and Andrei Popescu and Dmitriy Traytel", title = "Foundational extensible corecursion: a proof assistant perspective", journal = j-SIGPLAN, volume = "50", number = "9", pages = "192--204", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784732", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a formalized framework for defining corecursive functions safely in a total setting, based on corecursion up-to and relational parametricity. 
The end product is a general corecursor that allows corecursive (and even recursive) calls under ``friendly'' operations, including constructors. Friendly corecursive functions can be registered as such, thereby increasing the corecursor's expressiveness. The metatheory is formalized in the Isabelle proof assistant and forms the core of a prototype tool. The corecursor is derived from first principles, without requiring new axioms or extensions of the logic.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Steuwer:2015:GPP, author = "Michel Steuwer and Christian Fensch and Sam Lindley and Christophe Dubach", title = "Generating performance portable code using rewrite rules: from high-level functional expressions to high-performance {OpenCL} code", journal = j-SIGPLAN, volume = "50", number = "9", pages = "205--217", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784754", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computers have become increasingly complex with the emergence of heterogeneous hardware combining multicore CPUs and GPUs. These parallel systems exhibit tremendous computational power at the cost of increased programming effort resulting in a tension between performance and code portability. Typically, code is either tuned in a low-level imperative language using hardware-specific optimizations to achieve maximum performance or is written in a high-level, possibly functional, language to achieve portability at the expense of performance. We propose a novel approach aiming to combine high-level programming, code portability, and high-performance. Starting from a high-level functional expression we apply a simple set of rewrite rules to transform it into a low-level functional representation, close to the OpenCL programming model, from which OpenCL code is generated. Our rewrite rules define a space of possible implementations which we automatically explore to generate hardware-specific OpenCL implementations. We formalize our system with a core dependently-typed lambda-calculus along with a denotational semantics which we use to prove the correctness of the rewrite rules. We test our design in practice by implementing a compiler which generates high performance imperative OpenCL code. Our experiments show that we can automatically derive hardware-specific implementations from simple functional high-level algorithmic expressions offering performance on a par with highly tuned code for multicore CPUs and GPUs written by experts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Newton:2015:ALF, author = "Ryan R. Newton and Peter P. 
Fogg and Ali Varamesh", title = "Adaptive lock-free maps: purely-functional to scalable", journal = j-SIGPLAN, volume = "50", number = "9", pages = "218--229", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784734", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Purely functional data structures stored inside a mutable variable provide an excellent concurrent data structure: obviously correct, cheap to create, and supporting snapshots. They are not, however, scalable. We provide a way to retain the benefits of these pure-in-a-box data structures while dynamically converting to a more scalable lock-free data structure under contention. Our solution scales to any pair of pure and lock-free container types with key/value set semantics, while retaining lock-freedom. We demonstrate the principle in action on two very different platforms: first in the Glasgow Haskell Compiler and second in Java. To this end we extend GHC to support lock-free data structures and introduce a new approach for safe CAS in a lazy language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Le:2015:PAT, author = "Matthew Le and Matthew Fluet", title = "Partial aborts for transactions via first-class continuations", journal = j-SIGPLAN, volume = "50", number = "9", pages = "230--242", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784736", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software transactional memory (STM) has proven to be a useful abstraction for developing concurrent applications, where programmers denote transactions with an atomic construct that delimits a collection of reads and writes to shared mutable references. The runtime system then guarantees that all transactions are observed to execute atomically with respect to each other. Traditionally, when the runtime system detects that one transaction conflicts with another, it aborts one of the transactions and restarts its execution from the beginning. This can lead to problems with both execution time and throughput. In this paper, we present a novel approach that uses first-class continuations to restart a conflicting transaction at the point of a conflict, avoiding the re-execution of any work from the beginning of the transaction that has not been compromised. In practice, this allows transactions to complete more quickly, decreasing execution time and increasing throughput. We have implemented this idea in the context of the Manticore project, an ML-family language with support for parallelism and concurrency. Crucially, we rely on constant-time continuation capturing via a continuation-passing-style (CPS) transformation and heap-allocated continuations.
When comparing our STM that performs partial aborts against one that performs full aborts, we achieve a decrease in execution time of up to 31\% and an increase in throughput of up to 351\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Scherer:2015:WST, author = "Gabriel Scherer and Didier R{\'e}my", title = "Which simple types have a unique inhabitant?", journal = j-SIGPLAN, volume = "50", number = "9", pages = "243--255", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784757", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study the question of whether a given type has a unique inhabitant modulo program equivalence. In the setting of simply-typed lambda-calculus with sums, equipped with the strong $\beta\eta$-equivalence, we show that uniqueness is decidable. We present a saturating focused logic that introduces irreducible cuts on positive types ``as soon as possible''. Backward search in this logic gives an effective algorithm that returns either zero, one or two distinct inhabitants for any given type. Preliminary application studies show that such a feature can be useful in strongly-typed programs, inferring the code of highly-polymorphic library functions, or ``glue code'' inside more complex terms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Dunfield:2015:EEO, author = "Joshua Dunfield", title = "Elaborating evaluation-order polymorphism", journal = j-SIGPLAN, volume = "50", number = "9", pages = "256--268", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784744", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We classify programming languages according to evaluation order: each language fixes one evaluation order as the default, making it transparent to program in that evaluation order, and troublesome to program in the other. This paper develops a type system that is impartial with respect to evaluation order. Evaluation order is implicit in terms, and explicit in types, with by-value and by-name versions of type connectives. A form of intersection type quantifies over evaluation orders, describing code that is agnostic over (that is, polymorphic in) evaluation order. By allowing such generic code, programs can express the by-value and by-name versions of a computation without code duplication. We also formulate a type system that only has by-value connectives, plus a type that generalizes the difference between by-value and by-name connectives: it is either a suspension (by name) or a ``no-op'' (by value). We show a straightforward encoding of the impartial type system into the more economical one. Then we define an elaboration from the economical language to a call-by-value semantics, and prove that elaborating a well-typed source program, where evaluation order is implicit, produces a well-typed target program where evaluation order is explicit.
We also prove a simulation between evaluation of the target program and reductions (either by-value or by-name) in the source program. Finally, we prove that typing, elaboration, and evaluation are faithful to the type annotations given in the source program: if the programmer only writes by-value types, no by-name reductions can occur at run time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Rendel:2015:ARL, author = "Tillmann Rendel and Julia Trieflinger and Klaus Ostermann", title = "Automatic refunctionalization to a language with copattern matching: with applications to the expression problem", journal = j-SIGPLAN, volume = "50", number = "9", pages = "269--279", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784763", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "Defunctionalization and refunctionalization establish a correspondence between first-class functions and pattern matching, but the correspondence is not symmetric: Not all uses of pattern matching can be automatically refunctionalized to uses of higher-order functions. To remedy this asymmetry, we generalize from first-class functions to arbitrary codata. This leads us to full defunctionalization and refunctionalization between a codata language based on copattern matching and a data language based on pattern matching. We observe how programs can be written as matrices so that they are modularly extensible in one dimension but not the other. In this representation, defunctionalization and refunctionalization correspond to matrix transposition which effectively changes the dimension of extensibility a program supports. This suggests applications to the expression problem.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Russo:2015:FPT, author = "Alejandro Russo", title = "Functional pearl: two can keep a secret, if one of them uses {Haskell}", journal = j-SIGPLAN, volume = "50", number = "9", pages = "280--288", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784756", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For several decades, researchers from different communities have independently focused on protecting confidentiality of data. Two distinct technologies have emerged for such purposes: Mandatory Access Control (MAC) and Information-Flow Control (IFC)-the former belonging to operating systems (OS) research, while the latter to the programming languages community. These approaches restrict how data gets propagated within a system in order to avoid information leaks. In this scenario, Haskell plays a unique privileged role: it is able to protect confidentiality via libraries. This pearl presents a monadic API which statically protects confidentiality even in the presence of advanced features like exceptions, concurrency, and mutable data structures. 
Additionally, we present a mechanism to safely extend the library with new primitives, where library designers only need to indicate the read and write effects of new operations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Buiras:2015:HMS, author = "Pablo Buiras and Dimitrios Vytiniotis and Alejandro Russo", title = "{HLIO}: mixing static and dynamic typing for information-flow control in {Haskell}", journal = j-SIGPLAN, volume = "50", number = "9", pages = "289--301", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784758", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Information-Flow Control (IFC) is a well-established approach for allowing untrusted code to manipulate sensitive data without disclosing it. IFC is typically enforced via type systems and static analyses or via dynamic execution monitors. The LIO Haskell library, originating in operating systems research, implements a purely dynamic monitor of the sensitivity level of a computation, particularly suitable when data sensitivity levels are only known at runtime. In this paper, we show how to give programmers the flexibility of deferring IFC checks to runtime (as in LIO), while also providing static guarantees---and the absence of runtime checks---for parts of their programs that can be statically verified (unlike LIO). We present the design and implementation of our approach, HLIO (Hybrid LIO), as an embedding in Haskell that uses a novel technique for deferring IFC checks based on singleton types and constraint polymorphism. We formalize HLIO, prove non-interference, and show how interesting IFC examples can be programmed. Although our motivation is IFC, our technique for deferring constraints goes well beyond and offers a methodology for programmer-controlled hybrid type checking in Haskell.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{vanderPloeg:2015:PPF, author = "Atze van der Ploeg and Koen Claessen", title = "Practical principled {FRP}: forget the past, change the future, {FRPNow}!", journal = j-SIGPLAN, volume = "50", number = "9", pages = "302--314", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784752", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new interface for practical Functional Reactive Programming (FRP) that (1) is close in spirit to the original FRP ideas, (2) does not have the original space-leak problems, without using arrows or advanced types, and (3) provides a simple and expressive way for performing IO actions from FRP code. We also provide a denotational semantics for this new interface, and a technique (using Kripke logical relations) for reasoning about which FRP functions may ``forget their past'', i.e. which functions do not have an inherent space-leak. 
Finally, we show how we have implemented this interface as a Haskell library called FRPNow.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Bahr:2015:CSM, author = "Patrick Bahr and Jost Berthold and Martin Elsman", title = "Certified symbolic management of financial multi-party contracts", journal = j-SIGPLAN, volume = "50", number = "9", pages = "315--327", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784747", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Domain-specific languages (DSLs) for complex financial contracts are in practical use in many banks and financial institutions today. Given the level of automation and pervasiveness of software in the sector, the financial domain is immensely sensitive to software bugs. At the same time, there is an increasing need to analyse (and report on) the interaction between multiple parties. In this paper, we present a multi-party contract language that rigorously relegates any artefacts of simulation and computation from its core, which leads to favourable algebraic properties, and therefore allows for formalising domain-specific analyses and transformations using a proof assistant. At the centre of our formalisation is a simple denotational semantics independent of any stochastic aspects. Based on this semantics, we devise certified contract analyses and transformations. In particular, we give a type system, with an accompanying type inference procedure, that statically ensures that contracts follow the principle of causality. Moreover, we devise a reduction semantics that allows us to evolve contracts over time, in accordance with the denotational semantics. From the verified Coq definitions, we automatically extract a Haskell implementation of an embedded contract DSL along with the formally verified contract management functionality. This approach opens a road map towards more reliable contract management software, including the possibility of analysing contracts based on symbolic instead of numeric methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Smolka:2015:FCN, author = "Steffen Smolka and Spiridon Eliopoulos and Nate Foster and Arjun Guha", title = "A fast compiler for {NetKAT}", journal = j-SIGPLAN, volume = "50", number = "9", pages = "328--341", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784761", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "High-level programming languages play a key role in a growing number of networking platforms, streamlining application development and enabling precise formal reasoning about network behavior. Unfortunately, current compilers only handle ``local'' programs that specify behavior in terms of hop-by-hop forwarding behavior, or modest extensions such as simple paths. To encode richer ``global'' behaviors, programmers must add extra state --- something that is tricky to get right and makes programs harder to write and maintain. 
Making matters worse, existing compilers can take tens of minutes to generate the forwarding state for the network, even on relatively small inputs. This forces programmers to waste time working around performance issues or even revert to using hardware-level APIs. This paper presents a new compiler for the NetKAT language that handles rich features including regular paths and virtual networks, and yet is several orders of magnitude faster than previous compilers. The compiler uses symbolic automata to calculate the extra state needed to implement ``global'' programs, and an intermediate representation based on binary decision diagrams to dramatically improve performance. We describe the design and implementation of three essential compiler stages: from virtual programs (which specify behavior in terms of virtual topologies) to global programs (which specify network-wide behavior in terms of physical topologies), from global programs to local programs (which specify behavior in terms of single-switch behavior), and from local programs to hardware-level forwarding tables. We present results from experiments on real-world benchmarks that quantify performance in terms of compilation time and forwarding table size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Stucki:2015:RVP, author = "Nicolas Stucki and Tiark Rompf and Vlad Ureche and Phil Bagwell", title = "{RRB} vector: a practical general purpose immutable sequence", journal = j-SIGPLAN, volume = "50", number = "9", pages = "342--354", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784739", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "State-of-the-art immutable collections have wildly differing performance characteristics across their operations, often forcing programmers to choose different collection implementations for each task. Thus, changes to the program can invalidate the choice of collections, making code evolution costly. It would be desirable to have a collection that performs well for a broad range of operations. To this end, we present the RRB-Vector, an immutable sequence collection that offers good performance across a large number of sequential and parallel operations. The underlying innovations are: (1) the Relaxed-Radix-Balanced (RRB) tree structure, which allows efficient structural reorganization, and (2) an optimization that exploits spatio-temporal locality on the RRB data structure in order to offset the cost of traversing the tree. In our benchmarks, the RRB-Vector speedup for parallel operations is lower bounded by 7x when executing on 4 CPUs of 8 cores each. The performance for discrete operations, such as appending on either end, or updating and removing elements, is consistently good and compares favorably to the most important immutable sequence collections in the literature and in use today. 
The memory footprint of RRB-Vector is on par with arrays and an order of magnitude less than competing collections.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Jaskelioff:2015:FPS, author = "Mauro Jaskelioff and Exequiel Rivas", title = "Functional pearl: a smart view on datatypes", journal = j-SIGPLAN, volume = "50", number = "9", pages = "355--361", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784743", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Left-nested list concatenations, left-nested binds on the free monad, and left-nested choices in many non-determinism monads have an algorithmically bad performance. Can we solve this problem without losing the ability to pattern-match on the computation? Surprisingly, there is a deceptively simple solution: use a smart view to pattern-match on the datatype. We introduce the notion of smart view and show how it solves the problem of slow left-nested operations. In particular, we use the technique to obtain fast and simple implementations of lists, of free monads, and of two non-determinism monads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Yang:2015:ECC, author = "Edward Z. Yang and Giovanni Campagna and {\"O}mer S. Agacan and Ahmed El-Hassany and Abhishek Kulkarni and Ryan R. Newton", title = "Efficient communication and collection with compact normal forms", journal = j-SIGPLAN, volume = "50", number = "9", pages = "362--374", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784735", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In distributed applications, the transmission of non-contiguous data structures is greatly slowed down by the need to serialize them into a buffer before sending. We describe Compact Normal Forms, an API that allows programmers to explicitly place immutable heap objects into regions, which can both be accessed like ordinary data as well as efficiently transmitted over the network. The process of placing objects into compact regions (essentially a copy) is faster than any serializer and can be amortized over a series of functional updates to the data structure in question. 
We implement this scheme in the Glasgow Haskell Compiler and show that even with the space expansion attendant with memory-oriented data structure representations, we achieve between $2\times$ and $4\times$ speedups on fast local networks with sufficiently large data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Keil:2015:BAH, author = "Matthias Keil and Peter Thiemann", title = "Blame assignment for higher-order contracts with intersection and union", journal = j-SIGPLAN, volume = "50", number = "9", pages = "375--386", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784737", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an untyped calculus of blame assignment for a higher-order contract system with two new operators: intersection and union. The specification of these operators is based on the corresponding type theoretic constructions. This connection makes intersection and union contracts their inevitable dynamic counterparts with a range of desirable properties and makes them suitable for subsequent integration in a gradual type system. A denotational specification provides the semantics of a contract in terms of two sets: a set of terms satisfying the contract and a set of contexts respecting the contract. This kind of specification for contracts is novel and interesting in its own right. A nondeterministic operational semantics serves as the specification for contract monitoring and for proving its correctness. It is complemented by a deterministic semantics that is closer to an implementation and that is connected to the nondeterministic semantics by simulation. The calculus is the formal basis of TJS, a language embedded, higher-order contract system implemented for JavaScript.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Swords:2015:ECM, author = "Cameron Swords and Amr Sabry and Sam Tobin-Hochstadt", title = "Expressing contract monitors as patterns of communication", journal = j-SIGPLAN, volume = "50", number = "9", pages = "387--399", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784742", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new approach to contract semantics which expresses myriad monitoring strategies using a small core of foundational communication primitives. This approach allows multiple existing contract monitoring approaches, ranging from Findler and Felleisen's original model of higher-order contracts to semi-eager, parallel, or asynchronous monitors, to be expressed in a single language built on well-understood constructs. We prove that this approach accurately simulates the original semantics of higher-order contracts.
A straightforward implementation in Racket demonstrates the practicality of our approach, which not only enriches existing Racket monitoring strategies, but also supports a new style of monitoring in which collections of contracts collaborate to establish a global invariant.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Zhu:2015:LRT, author = "He Zhu and Aditya V. Nori and Suresh Jagannathan", title = "Learning refinement types", journal = j-SIGPLAN, volume = "50", number = "9", pages = "400--411", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784766", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose the integration of a random test generation system (capable of discovering program bugs) and a refinement type system (capable of expressing and verifying program invariants), for higher-order functional programs, using a novel lightweight learning algorithm as an effective intermediary between the two. Our approach is based on the well-understood intuition that useful, but difficult to infer, program properties can often be observed from concrete program states generated by tests; these properties act as likely invariants, which if used to refine simple types, can have their validity checked by a refinement type checker. We describe an implementation of our technique for a variety of benchmarks written in ML, and demonstrate its effectiveness in inferring and proving useful invariants for programs that express complex higher-order control and dataflow.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Pavlinovic:2015:PSB, author = "Zvonimir Pavlinovic and Tim King and Thomas Wies", title = "Practical {SMT}-based type error localization", journal = j-SIGPLAN, volume = "50", number = "9", pages = "412--423", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784765", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compilers for statically typed functional programming languages are notorious for generating confusing type error messages. When the compiler detects a type error, it typically reports the program location where the type checking failed as the source of the error. Since other error sources are not even considered, the actual root cause is often missed. A more adequate approach is to consider all possible error sources and report the most useful one subject to some usefulness criterion. In our previous work, we showed that this approach can be formulated as an optimization problem related to satisfiability modulo theories (SMT). This formulation cleanly separates the heuristic nature of usefulness criteria from the underlying search problem. Unfortunately, algorithms that search for an optimal error source cannot directly use principal types which are crucial for dealing with the exponential-time complexity of the decision problem of polymorphic type checking.
In this paper, we present a new algorithm that efficiently finds an optimal error source in a given ill-typed program. Our algorithm uses an improved SMT encoding to cope with the high complexity of polymorphic typing by iteratively expanding the typing constraints from which principal types are derived. The algorithm preserves the clean separation between the heuristics and the actual search. We have implemented our algorithm for OCaml. In our experimental evaluation, we found that the algorithm reduces the running times for optimal type error localization from minutes to seconds and scales better than previous localization algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Karachalias:2015:GMT, author = "Georgios Karachalias and Tom Schrijvers and Dimitrios Vytiniotis and Simon Peyton Jones", title = "{GADTs} meet their match: pattern-matching warnings that account for {GADTs}, guards, and laziness", journal = j-SIGPLAN, volume = "50", number = "9", pages = "424--436", month = sep, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858949.2784748", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "For ML and Haskell, accurate warnings when a function definition has redundant or missing patterns are mission critical. But today's compilers generate bogus warnings when the programmer uses guards (even simple ones), GADTs, pattern guards, or view patterns. We give the first algorithm that handles all these cases in a single, uniform framework, together with an implementation in GHC, and evidence of its utility in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '15 conference proceedings.", } @Article{Hague:2015:DRC, author = "Matthew Hague and Anthony W. Lin and C.-H. Luke Ong", title = "Detecting redundant {CSS} rules in {HTML5} applications: a tree rewriting approach", journal = j-SIGPLAN, volume = "50", number = "10", pages = "1--19", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814288", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "HTML5 applications normally have a large set of CSS (Cascading Style Sheets) rules for data display. Each CSS rule consists of a node selector and a declaration block (which assigns values to selected nodes' display attributes). As web applications evolve, maintaining CSS files can easily become problematic. Some CSS rules will be replaced by new ones, but these obsolete (hence redundant) CSS rules often remain in the applications. Not only does this ``bloat'' the applications --- increasing the bandwidth requirement --- but it also significantly increases web browsers' processing time. 
Most works on detecting redundant CSS rules in HTML5 applications do not consider the dynamic behaviours of HTML5 (specified in JavaScript); in fact, the only proposed method that takes these into account is dynamic analysis, which cannot soundly prove redundancy of CSS rules. In this paper, we introduce an abstraction of HTML5 applications based on monotonic tree-rewriting and study its ``redundancy problem''. We establish the precise complexity of the problem and various subproblems of practical importance (ranging from P to EXP). In particular, our algorithm relies on an efficient reduction to an analysis of symbolic pushdown systems (for which highly optimised solvers are available), which yields a fast method for checking redundancy in practice. We implemented our algorithm and demonstrated its efficacy in detecting redundant CSS rules in HTML5 applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Demsky:2015:SSD, author = "Brian Demsky and Patrick Lam", title = "{SATCheck}: {SAT}-directed stateless model checking for {SC} and {TSO}", journal = j-SIGPLAN, volume = "50", number = "10", pages = "20--36", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814297", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Writing low-level concurrent code is well known to be challenging and error prone. The widespread deployment of multi-core hardware and the shift towards using low-level concurrent data structures has moved the problem into the mainstream. Finding bugs in such code may require finding a specific bug-revealing thread interleaving out of a huge space of parallel executions. Model-checking is a powerful technique for exhaustively testing code. However, scaling model checking presents a significant challenge. In this paper we present a new and more scalable technique for model checking concurrent code, based on concrete execution. Our technique observes concrete behaviors, builds a model of these behaviors, encodes the model in SAT, and leverages SAT solver technology to find executions that reveal new behaviors. It then runs the new execution, incorporates the newly observed behavior, and repeats the process until it has explored all reachable behaviors. We have implemented a prototype of our approach in the SATCheck tool. Our tool supports both the Total Store Ordering (TSO) and Sequentially Consistent (SC) memory models. We evaluate SATCheck by testing several concurrent data structure implementations and comparing its performance to the original DPOR stateless model checking algorithm implemented in CDSChecker, the source DPOR algorithm implemented in Nidhugg, and CheckFence. 
Our experiments show that SATCheck scales better than previous approaches while at the same time operating on concrete executions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Kuraj:2015:PES, author = "Ivan Kuraj and Viktor Kuncak and Daniel Jackson", title = "Programming with enumerable sets of structures", journal = j-SIGPLAN, volume = "50", number = "10", pages = "37--56", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814323", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an efficient, modular, and feature-rich framework for automated generation and validation of complex structures, suitable for tasks that explore a large space of structured values. Our framework is capable of exhaustive, incremental, parallel, and memoized enumeration from not only finite but also infinite domains, while providing fine-grained control over the process. Furthermore, the framework efficiently supports the inverse of enumeration (checking whether a structure can be generated and fast-forwarding to this structure to continue the enumeration) and lazy enumeration (achieving exhaustive testing without generating all structures). The foundation of efficient enumeration lies in both direct access to encoded structures, achieved with well-known and new pairing functions, and dependent enumeration, which embeds constraints into the enumeration to avoid backtracking. Our framework defines an algebra of enumerators, with combinators for their composition that preserve exhaustiveness and efficiency. We have implemented our framework as a domain-specific language in Scala. Our experiments demonstrate better performance and shorter specifications by up to a few orders of magnitude compared to existing approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Jensen:2015:SMC, author = "Casper S. Jensen and Anders M{\o}ller and Veselin Raychev and Dimitar Dimitrov and Martin Vechev", title = "Stateless model checking of event-driven applications", journal = j-SIGPLAN, volume = "50", number = "10", pages = "57--73", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814282", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern event-driven applications, such as web pages and mobile apps, rely on asynchrony to ensure smooth end-user experience. Unfortunately, even though these applications are executed by a single event-loop thread, they can still exhibit nondeterministic behaviors depending on the execution order of interfering asynchronous events. As in classic shared-memory concurrency, this nondeterminism makes it challenging to discover errors that manifest only in specific schedules of events. In this work we propose the first stateless model checker for event-driven applications, called R4. Our algorithm systematically explores the nondeterminism in the application and concisely exposes its overall effect, which is useful for bug discovery.
The algorithm builds on a combination of three key insights: (i) a dynamic partial order reduction (DPOR) technique for reducing the search space, tailored to the domain of event-driven applications, (ii) conflict-reversal bounding based on a hypothesis that most errors occur with a small number of event reorderings, and (iii) approximate replay of event sequences, which is critical for separating harmless from harmful nondeterminism. We instantiate R4 for the domain of client-side web applications and use it to analyze event interference in a number of real-world programs. The experimental results indicate that the precision and overall exploration capabilities of our system significantly exceed those of existing techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Hottelier:2015:SLE, author = "Thibaud Hottelier and Rastislav Bodik", title = "Synthesis of layout engines from relational constraints", journal = j-SIGPLAN, volume = "50", number = "10", pages = "74--88", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814291", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an algorithm for synthesizing efficient document layout engines from compact relational specifications. These specifications are compact in that a single specification can produce multiple engines, each for a distinct layout situation, i.e., a different combination of known vs. unknown attributes. Technically, our specifications are relational attribute grammars, while our engines are functional attribute grammars. By synthesizing functions from relational constraints, we obviate the need for constraint solving at runtime, because functional attribute grammars can be easily evaluated according to a fixed schedule, sidestepping the backtracking search performed by constraint solvers. Our experiments show that we can generate layout engines for non-trivial data visualizations, and that our synthesized engines are between 39- and 200-times faster than general-purpose constraint solvers. Relational specifications of layout give rise to synthesis problems that have previously proved intractable. Our algorithm exploits the hierarchical, grammar-based structure of the specification, decomposing the specification into smaller subproblems, which can be tackled with off-the-shelf synthesis procedures. The new synthesis problem then becomes the composition of the functions thus generated into a correct attribute grammar, which might be recursive.
We show how to solve this problem by efficient reduction to an SMT problem.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Erdweg:2015:SOI, author = "Sebastian Erdweg and Moritz Lichter and Manuel Weiel", title = "A sound and optimal incremental build system with dynamic dependencies", journal = j-SIGPLAN, volume = "50", number = "10", pages = "89--106", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814316", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Build systems are used in all but the smallest software projects to invoke the right build tools on the right files in the right order. A build system must be sound (after a build, generated files consistently reflect the latest source files) and efficient (recheck and rebuild as few build units as possible). Contemporary build systems provide limited efficiency because they lack support for expressing fine-grained file dependencies. We present a build system called pluto that supports the definition of reusable, parameterized, interconnected builders. When run, a builder notifies the build system about dynamically required and produced files as well as about other builders whose results are needed. To support fine-grained file dependencies, we generalize the traditional notion of time stamps to allow builders to declare their actual requirements on a file's content. pluto collects the requirements and products of a builder with their stamps in a build summary. This enables pluto to provide provably sound and optimal incremental rebuilding. To support dynamic dependencies, our rebuild algorithm interleaves dependency analysis and builder execution and enforces invariants on the dependency graph through a dynamic analysis. We have developed pluto as a Java API and used it to implement more than 25 builders. We describe our experience with migrating a larger Ant build script to pluto and compare the respective build times.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Polozov:2015:FFI, author = "Oleksandr Polozov and Sumit Gulwani", title = "{FlashMeta}: a framework for inductive program synthesis", journal = j-SIGPLAN, volume = "50", number = "10", pages = "107--126", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814310", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Inductive synthesis, or programming-by-examples (PBE), is gaining prominence with disruptive applications for automating repetitive tasks in end-user programming. However, designing, developing, and maintaining an effective industrial-quality inductive synthesizer is an intellectual and engineering challenge, requiring 1-2 man-years of effort. Our novel observation is that many PBE algorithms are a natural fall-out of one generic meta-algorithm and the domain-specific properties of the operators in the underlying domain-specific language (DSL).
The meta-algorithm propagates example-based constraints on an expression to its subexpressions by leveraging associated witness functions, which essentially capture the inverse semantics of the underlying operator. This observation enables a novel program synthesis methodology called data-driven domain-specific deduction (D4), where domain-specific insight, provided by the DSL designer, is separated from the synthesis algorithm. Our FlashMeta framework implements this methodology, allowing synthesizer developers to generate an efficient synthesizer from the mere DSL definition (if properties of the DSL operators have been modeled). In our case studies, we found that 10+ existing industrial-quality mass-market applications based on PBE can be cast as instances of D4. Our evaluation includes reimplementation of some prior works, which in FlashMeta become more efficient, maintainable, and extensible. As a result, FlashMeta-based PBE tools are deployed in several industrial products, including Microsoft PowerShell 3.0 for Windows 10, Azure Operational Management Suite, and Microsoft Cortana digital assistant.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Zhang:2015:SYB, author = "Haoyuan Zhang and Zewei Chu and Bruno C. d. S. Oliveira and Tijs van der Storm", title = "Scrap your boilerplate with object algebras", journal = j-SIGPLAN, volume = "50", number = "10", pages = "127--146", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814279", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Traversing complex Abstract Syntax Trees (ASTs) typically requires large amounts of tedious boilerplate code. For many operations most of the code simply walks the structure, and only a small portion of the code implements the functionality that motivated the traversal in the first place. This paper presents a type-safe Java framework called Shy that removes much of this boilerplate code. In Shy object algebras are used to describe complex and extensible AST structures. Using Java annotations Shy generates generic boilerplate code for various types of traversals. For a concrete traversal, users of Shy can then inherit from the generated code and override only the interesting cases. Consequently, the amount of code that users need to write is significantly smaller. Moreover, traversals using the Shy framework are also much more structure shy, becoming more adaptive to future changes or extensions to the AST structure. To prove the effectiveness of the approach, we applied Shy in the implementation of a domain-specific questionnaire language. 
Our results show that for a large number of traversals there was a significant reduction in the amount of user-defined code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Sharma:2015:CCS, author = "Rahul Sharma and Eric Schkufza and Berkeley Churchill and Alex Aiken", title = "Conditionally correct superoptimization", journal = j-SIGPLAN, volume = "50", number = "10", pages = "147--162", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814278", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The aggressive optimization of heavily used kernels is an important problem in high-performance computing. However, both general purpose compilers and highly specialized tools such as superoptimizers often do not have sufficient static knowledge of restrictions on program inputs that could be exploited to produce the very best code. For many applications, the best possible code is conditionally correct: the optimized kernel is equal to the code that it replaces only under certain preconditions on the kernel's inputs. The main technical challenge in producing conditionally correct optimizations is in obtaining non-trivial and useful conditions and proving conditional equivalence formally in the presence of loops. We combine abstract interpretation, decision procedures, and testing to yield a verification strategy that can address both of these problems. This approach yields a superoptimizer for x86 that in our experiments produces binaries that are often multiple times faster than those produced by production compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Blackshear:2015:SCF, author = "Sam Blackshear and Bor-Yuh Evan Chang and Manu Sridharan", title = "Selective control-flow abstraction via jumping", journal = j-SIGPLAN, volume = "50", number = "10", pages = "163--182", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814293", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present jumping, a form of selective control-flow abstraction useful for improving the scalability of goal-directed static analyses. Jumping is useful for analyzing programs with complex control-flow such as event-driven systems. In such systems, accounting for orderings between certain events is important for precision, yet analyzing the product graph of all possible event orderings is intractable. Jumping solves this problem by allowing the analysis to selectively abstract away control-flow between events irrelevant to a goal query while preserving information about the ordering of relevant events. We present a framework for designing sound jumping analyses and create an instantiation of the framework for performing precise inter-event analysis of Android applications. 
Our experimental evaluation showed that using jumping to augment a precise goal-directed analysis with inter-event reasoning enabled our analysis to prove 90-97\% of dereferences safe across our benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Madhavan:2015:AGC, author = "Ravichandhran Madhavan and Mika{\"e}l Mayer and Sumit Gulwani and Viktor Kuncak", title = "Automating grammar comparison", journal = j-SIGPLAN, volume = "50", number = "10", pages = "183--200", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814304", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We consider from a practical perspective the problem of checking equivalence of context-free grammars. We present techniques for proving equivalence, as well as techniques for finding counter-examples that establish non-equivalence. Among the key building blocks of our approach is a novel algorithm for efficiently enumerating and sampling words and parse trees from arbitrary context-free grammars; the algorithm supports polynomial time random access to words belonging to the grammar. Furthermore, we propose an algorithm for proving equivalence of context-free grammars that is complete for LL grammars, yet can be invoked on any context-free grammar, including ambiguous grammars. Our techniques successfully find discrepancies between different syntax specifications of several real-world languages, and are capable of detecting fine-grained incremental modifications performed on grammars. Our evaluation shows that our tool improves significantly on existing state-of-the-art tools. In addition, we used these algorithms to develop an online tutoring system for grammars that we then used in an undergraduate course on computer language processing. On questions involving grammar constructions, our system was able to automatically evaluate the correctness of 95\% of the solutions submitted by students: it disproved 74\% of cases and proved 21\% of them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Ntzik:2015:RAP, author = "Gian Ntzik and Philippa Gardner", title = "Reasoning about the {POSIX} file system: local update and global pathnames", journal = j-SIGPLAN, volume = "50", number = "10", pages = "201--220", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814306", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a program logic for specifying a core sequential subset of the POSIX file system and for reasoning abstractly about client programs working with the file system. The challenge is to reason about the combination of local directory update and global pathname traversal (including '..' and symbolic links) which may overlap the directories being updated. Existing reasoning techniques are either based on first-order logic and do not scale, or on separation logic and can only handle linear pathnames (no '..' or symbolic links).
We introduce fusion logic for reasoning about local update and global pathname traversal, introducing a novel effect frame rule to propagate the effect of a local update on overlapping pathnames. We apply our reasoning to the standard recursive remove utility ({\tt rm -r}), discovering bugs in well-known implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Ou:2015:AAI, author = "Peizhao Ou and Brian Demsky", title = "{AutoMO}: automatic inference of memory order parameters for {C\slash C++11}", journal = j-SIGPLAN, volume = "50", number = "10", pages = "221--240", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814286", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many concurrent data structures are initially designed for the sequential consistency (SC) memory model. Developers often implement these data structures on real-world systems with weaker memory models by adding sufficient fences to ensure that their implementation on the weak memory model exhibits the same executions as the SC memory model. Recently, the C11 and C++11 standards have added a weak memory model to the C and C++ languages. Developing and debugging code for weak memory models can be extremely challenging. We present AutoMO, a framework to support porting data structures designed for the SC memory model to the C/C++11 memory model. AutoMO provides support across the porting process: (1) it automatically infers initial settings for the memory order parameters, (2) it detects whether a C/C++11 execution is equivalent to some SC execution, and (3) it simplifies traces to make them easier to understand. We have used AutoMO to successfully infer memory order parameters for a range of data structures and to check whether executions of several concurrent data structure implementations are SC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Biswas:2015:VES, author = "Swarnendu Biswas and Minjia Zhang and Michael D. Bond and Brandon Lucia", title = "{Valor}: efficient, software-only region conflict exceptions", journal = j-SIGPLAN, volume = "50", number = "10", pages = "241--259", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814292", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data races complicate programming language semantics, and a data race is often a bug. Existing techniques detect data races and define their semantics by detecting conflicts between synchronization-free regions (SFRs). However, such techniques either modify hardware or slow programs dramatically, preventing always-on use today. This paper describes Valor, a sound, precise, software-only region conflict detection analysis that achieves high performance by eliminating the costly analysis on each read operation that prior approaches require. Valor instead logs a region's reads and lazily detects conflicts for logged reads when the region ends. 
As a comparison, we have also developed FastRCD, a conflict detector that leverages the epoch optimization strategy of the FastTrack data race detector. We evaluate Valor, FastRCD, and FastTrack, showing that Valor dramatically outperforms FastRCD and FastTrack. Valor is the first region conflict detector to provide strong semantic guarantees for racy program executions with under 2X slowdown. Overall, Valor advances the state of the art in always-on support for strong behavioral guarantees for data races.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Cohen:2015:AMR, author = "Nachshon Cohen and Erez Petrank", title = "Automatic memory reclamation for lock-free data structures", journal = j-SIGPLAN, volume = "50", number = "10", pages = "260--279", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814298", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Lock-free data-structures are widely employed in practice, yet designing lock-free memory reclamation for them is notoriously difficult. In particular, all known lock-free reclamation schemes are ``manual'' in the sense that the developer has to specify when nodes have retired and may be reclaimed. Retiring nodes adequately is non-trivial and often requires the modification of the original lock-free algorithm. In this paper we present an automatic lock-free reclamation scheme for lock-free data-structures in the spirit of a mark-sweep garbage collection. The proposed algorithm works with any normalized lock-free algorithm and with no need for the programmer to retire nodes or make changes to the algorithm. Evaluation of the proposed scheme on a linked-list and a hash table shows that it performs similarly to the best manual (lock-free) memory reclamation scheme.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Lopez:2015:PBV, author = "Hugo A. L{\'o}pez and Eduardo R. B. Marques and Francisco Martins and Nicholas Ng and C{\'e}sar Santos and Vasco Thudichum Vasconcelos and Nobuko Yoshida", title = "Protocol-based verification of message-passing parallel programs", journal = j-SIGPLAN, volume = "50", number = "10", pages = "280--298", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814302", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present ParTypes, a type-based methodology for the verification of Message Passing Interface (MPI) programs written in the C programming language. The aim is to statically verify programs against protocol specifications, enforcing properties such as fidelity and absence of deadlocks. We develop a protocol language based on a dependent type system for message-passing parallel programs, which includes various communication operators, such as point-to-point messages, broadcast, reduce, array scatter and gather. For the verification of a program against a given protocol, the protocol is first translated into a representation read by VCC, a software verifier for C. 
We successfully verified several MPI programs in a running time that is independent of the number of processes or other input parameters. This contrasts with alternative techniques, notably model checking and runtime verification, that suffer from the state-explosion problem or that otherwise depend on parameters to the program itself. We experimentally evaluated our approach against state-of-the-art tools for MPI to conclude that our approach offers a scalable solution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Bastani:2015:IVA, author = "Osbert Bastani and Saswat Anand and Alex Aiken", title = "Interactively verifying absence of explicit information flows in {Android} apps", journal = j-SIGPLAN, volume = "50", number = "10", pages = "299--315", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814274", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "App stores are increasingly the preferred mechanism for distributing software, including mobile apps (Google Play), desktop apps (Mac App Store and Ubuntu Software Center), computer games (the Steam Store), and browser extensions (Chrome Web Store). The centralized nature of these stores has important implications for security. While app stores have unprecedented ability to audit apps, users now trust hosted apps, making them more vulnerable to malware that evades detection and finds its way onto the app store. Sound static explicit information flow analysis has the potential to significantly aid human auditors, but it is handicapped by high false positive rates. Instead, auditors currently rely on a combination of dynamic analysis (which is unsound) and lightweight static analysis (which cannot identify information flows) to help detect malicious behaviors. We propose a process for producing apps certified to be free of malicious explicit information flows. In practice, imprecision in the reachability analysis is a major source of false positive information flows that are difficult to understand and discharge. In our approach, the developer provides tests that specify what code is reachable, allowing the static analysis to restrict its search to tested code. The app hosted on the store is instrumented to enforce the provided specification (i.e., executing untested code terminates the app). We use abductive inference to minimize the necessary instrumentation, and then interact with the developer to ensure that the instrumentation only cuts unreachable code. 
We demonstrate the effectiveness of our approach in verifying a corpus of 77 Android apps --- our interactive verification process successfully discharges 11 out of the 12 false positives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Brutschy:2015:SGD, author = "Lucas Brutschy and Pietro Ferrara and Omer Tripp and Marco Pistoia", title = "{ShamDroid}: gracefully degrading functionality in the presence of limited resource access", journal = j-SIGPLAN, volume = "50", number = "10", pages = "316--331", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814296", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Given a program whose functionality depends on access to certain external resources, we investigate the question of how to gracefully degrade functionality when a subset of those resources is unavailable. The concrete setting motivating this problem statement is mobile applications, which rely on contextual data (e.g., device identifiers, user location and contacts, etc.) to fulfill their functionality. In particular, we focus on the Android platform, which mediates access to resources via an installation-time permission model. On the one hand, granting an app the permission to access a resource (e.g., the device ID) entails privacy threats (e.g., releasing the device ID to advertising servers). On the other hand, denying access to a resource could render the app useless (e.g., if inability to read the device ID is treated as an error state). Our goal is to specialize an existing Android app in such a way that it is disabled from accessing certain sensitive resources (or contextual data) as specified by the user, while still being able to execute functionality that does not depend on those resources. We present ShamDroid, a program transformation algorithm, based on specialized forms of program slicing, backwards static analysis and constraint solving, that enables the use of Android apps with partial permissions. We rigorously state the guarantees provided by ShamDroid w.r.t. functionality maximization. We provide an evaluation over the top 500 Google Play apps and report on an extensive comparative evaluation of ShamDroid against three other state-of-the-art solutions (APM, XPrivacy, and Google App Ops) that mediate resource access at the system (rather than app) level. ShamDroid performs better than all of these tools by a significant margin, leading to abnormal behavior in only 1 out of 27 apps we manually investigated, compared to the other solutions, which cause crashes and abnormalities in 9 or more of the apps.
This demonstrates the importance of performing app-sensitive mocking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Bielik:2015:SRD, author = "Pavol Bielik and Veselin Raychev and Martin Vechev", title = "Scalable race detection for {Android} applications", journal = j-SIGPLAN, volume = "50", number = "10", pages = "332--348", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814303", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a complete end-to-end dynamic analysis system for finding data races in mobile Android applications. The capabilities of our system significantly exceed the state of the art: our system can analyze real-world application interactions in minutes rather than hours, finds errors inherently beyond the reach of existing approaches, while still (critically) reporting very few false positives. Our system is based on three key concepts: (i) a thorough happens-before model of Android-specific concurrency, (ii) a scalable analysis algorithm for efficiently building and querying the happens-before graph, and (iii) an effective set of domain-specific filters that reduce the number of reported data races by several orders of magnitude. We evaluated the usability and performance of our system on 354 real-world Android applications (e.g., Facebook). Our system analyzes a minute of end-user interaction with the application in about 24 seconds, while current approaches take hours to complete. Inspecting the results for 8 large open-source applications revealed 15 harmful bugs of diverse kinds. Some of the bugs we reported were confirmed and fixed by developers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Hu:2015:VYL, author = "Yongjian Hu and Tanzirul Azim and Iulian Neamtiu", title = "Versatile yet lightweight record-and-replay for {Android}", journal = j-SIGPLAN, volume = "50", number = "10", pages = "349--366", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814320", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recording and replaying the execution of smartphone apps is useful in a variety of contexts, from reproducing bugs to profiling and testing. Achieving effective record-and-replay is a balancing act between accuracy and overhead. On smartphones, the act is particularly complicated, because smartphone apps receive a high-bandwidth stream of input (e.g., network, GPS, camera, microphone, touchscreen) and concurrency events, but the stream has to be recorded and replayed with minimal overhead, to avoid interfering with app execution. Prior record-and-replay approaches have focused on replaying machine instructions or system calls, which is not a good fit on smartphones. We propose a novel, stream-oriented record-and-replay approach which achieves high-accuracy and low-overhead by aiming at a sweet spot: recording and replaying sensor and network input, event schedules, and inter-app communication via intents. 
To demonstrate the versatility of our approach, we have constructed a tool named VALERA that supports record-and-replay on the Android platform. VALERA works with apps running directly on the phone, and does not require access to the app source code. Through an evaluation on 50 popular Android apps, we show that: VALERA's replay fidelity far exceeds current record-and-replay approaches for Android; VALERA's precise timing control and low overhead (about 1\% for either record or replay) allow it to replay high-throughput, timing-sensitive apps such as video/audio capture and recognition; and VALERA's support for event schedule replay enables the construction of useful analyses, such as reproducing event-driven race bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Bender:2015:DFI, author = "John Bender and Mohsen Lesani and Jens Palsberg", title = "Declarative fence insertion", journal = j-SIGPLAN, volume = "50", number = "10", pages = "367--385", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814318", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Previous work has shown how to insert fences that enforce sequential consistency. However, for many concurrent algorithms, sequential consistency is unnecessarily strong and can lead to high execution overhead. The reason is that, often, correctness relies on the execution order of a few specific pairs of instructions. Algorithm designers can declare those execution orders and thereby enable memory-model-independent reasoning about correctness and also ease implementation of algorithms on multiple platforms. The literature has examples of such reasoning, while tool support for enforcing the orders has been lacking until now. In this paper we present a declarative approach to specify and enforce execution orders. Our fence insertion algorithm first identifies the execution orders that a given memory model enforces automatically, and then inserts fences that enforce the rest. Our benchmarks include three off-the-shelf transactional memory algorithms written in C/C++ for which we specify suitable execution orders. For those benchmarks, our experiments with the x86 and ARMv7 memory models show that our tool inserts fences that are competitive with those inserted by the original authors. Our tool is the first to insert fences into transactional memory algorithms and it solves the long-standing problem of how to easily port such algorithms to a novel memory model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Le:2015:FDC, author = "Vu Le and Chengnian Sun and Zhendong Su", title = "Finding deep compiler bugs via guided stochastic program mutation", journal = j-SIGPLAN, volume = "50", number = "10", pages = "386--399", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814319", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compiler testing is important and challenging.
Equivalence Modulo Inputs (EMI) is a recent promising approach for compiler validation. It is based on mutating the unexecuted statements of an existing program under some inputs to produce new equivalent test programs w.r.t. these inputs. Orion is a simple realization of EMI by only randomly deleting unexecuted statements. Despite its success in finding many bugs in production compilers, Orion's effectiveness is still limited by its simple, blind mutation strategy. To more effectively realize EMI, this paper introduces a guided, advanced mutation strategy based on Bayesian optimization. Our goal is to generate diverse programs to more thoroughly exercise compilers. We achieve this with two techniques: (1) the support of both code deletions and insertions in the unexecuted regions, leading to a much larger test program space; and (2) the use of an objective function that promotes control-flow-diverse programs for guiding Markov Chain Monte Carlo (MCMC) optimization to explore the search space. Our technique helps discover deep bugs that require elaborate mutations. Our realization, Athena, targets C compilers. In 19 months, Athena has found 72 new bugs --- many of which are deep and important bugs --- in GCC and LLVM. Developers have confirmed all 72 bugs and fixed 68 of them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Wang:2015:VAR, author = "Haichuan Wang and David Padua and Peng Wu", title = "Vectorization of {Apply} to reduce interpretation overhead of {R}", journal = j-SIGPLAN, volume = "50", number = "10", pages = "400--415", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814273", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "R is a popular dynamic language designed for statistical computing. Despite R's huge user base, the inefficiency in R's language implementation becomes a major pain-point in everyday use as well as an obstacle to applying R to solve large-scale analytics problems. The two most common approaches to improve the performance of dynamic languages are: implementing more efficient interpretation strategies and extending the interpreter with a Just-In-Time (JIT) compiler. However, both approaches require significant changes to the interpreter, and complicate the adoption by development teams as a result. This paper presents a new approach to improve execution efficiency of R programs by vectorizing the widely used Apply class of operations. Apply accepts two parameters: a function and a collection of input data elements. The standard implementation of Apply iteratively invokes the input function with each element in the data collection. Our approach combines data transformation and function vectorization to convert the looping-over-data execution of the standard Apply into a single invocation of a vectorized function that contains a sequence of vector operations over the input data. This conversion can significantly speed up the execution of Apply operations in R by reducing the number of interpretation steps. We implemented the vectorization transformation as an R package.
To enable the optimization, all that is needed is to invoke the package, and the user can use a normal R interpreter without any changes. The evaluation shows that the proposed method delivers significant performance improvements for a collection of data analysis algorithm benchmarks. This is achieved without any native code generation and using only a single-thread of execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Gvero:2015:SJE, author = "Tihomir Gvero and Viktor Kuncak", title = "Synthesizing {Java} expressions from free-form queries", journal = j-SIGPLAN, volume = "50", number = "10", pages = "416--432", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814295", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new code assistance tool for integrated development environments. Our system accepts as input free-form queries containing a mixture of English and Java, and produces Java code expressions that take the query into account and respect syntax, types, and scoping rules of Java, as well as statistical usage patterns. In contrast to solutions based on code search, the results returned by our tool need not directly correspond to any previously seen code fragment. As part of our system we have constructed a probabilistic context free grammar for Java constructs and library invocations, as well as an algorithm that uses a customized natural language processing tool chain to extract information from free-form text queries. We present the results on a number of examples showing that our technique (1) often produces the expected code fragments, (2) tolerates much of the flexibility of natural language, and (3) can repair incorrect Java expressions that use, for example, the wrong syntax or missing arguments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Zheng:2015:APP, author = "Yudi Zheng and Lubom{\'\i}r Bulej and Walter Binder", title = "Accurate profiling in the presence of dynamic compilation", journal = j-SIGPLAN, volume = "50", number = "10", pages = "433--450", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814281", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many profilers based on bytecode instrumentation yield wrong results in the presence of an optimizing dynamic compiler, either due to not being aware of optimizations such as stack allocation and method inlining, or due to the inserted code disrupting such optimizations. To avoid such perturbations, we present a novel technique to make any profiler implemented at the bytecode level aware of optimizations performed by the dynamic compiler. We implement our approach in a state-of-the-art Java virtual machine and demonstrate its significance with concrete profilers. 
We quantify the impact of escape analysis on allocation profiling and object life-time analysis, and the impact of method inlining on callsite profiling. We illustrate how our approach enables new kinds of profilers, such as a profiler for non-inlined callsites, and a testing framework for locating performance bugs in dynamic compiler implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Aigner:2015:FMS, author = "Martin Aigner and Christoph M. Kirsch and Michael Lippautz and Ana Sokolova", title = "Fast, multicore-scalable, low-fragmentation memory allocation through large virtual memory and global data structures", journal = j-SIGPLAN, volume = "50", number = "10", pages = "451--469", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814294", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We demonstrate that general-purpose memory allocation involving many threads on many cores can be done with high performance, multicore scalability, and low memory consumption. For this purpose, we have designed and implemented scalloc, a concurrent allocator that generally performs and scales in our experiments better than other allocators while using less memory, and is still competitive otherwise. The main ideas behind the design of scalloc are: uniform treatment of small and big objects through so-called virtual spans, efficiently and effectively reclaiming free memory through fast and scalable global data structures, and constant-time (modulo synchronization) allocation and deallocation operations that trade off memory reuse and spatial locality without being subject to false sharing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Boston:2015:PTI, author = "Brett Boston and Adrian Sampson and Dan Grossman and Luis Ceze", title = "Probability type inference for flexible approximate programming", journal = j-SIGPLAN, volume = "50", number = "10", pages = "470--487", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814301", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In approximate computing, programs gain efficiency by allowing occasional errors. Controlling the probabilistic effects of this approximation remains a key challenge. We propose a new approach where programmers use a type system to communicate high-level constraints on the degree of approximation. A combination of type inference, code specialization, and optional dynamic tracking makes the system expressive and convenient. The core type system captures the probability that each operation exhibits an error and bounds the probability that each expression deviates from its correct value. Solver-aided type inference lets the programmer specify the correctness probability on only some variables --- program outputs, for example --- and automatically fills in other types to meet these specifications.
An optional dynamic type helps cope with complex run-time behavior where static approaches are insufficient. Together, these features interact to yield a high degree of programmer control while offering a strong soundness guarantee. We use existing approximate-computing benchmarks to show how our language, DECAF, maintains a low annotation burden. Our constraint-based approach can encode hardware details, such as finite degrees of reliability, so we also use DECAF to examine implications for approximate hardware design. We find that multi-level architectures can offer advantages over simpler two-level machines and that solver-aided optimization improves efficiency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Jantz:2015:CLM, author = "Michael R. Jantz and Forrest J. Robinson and Prasad A. Kulkarni and Kshitij A. Doshi", title = "Cross-layer memory management for managed language applications", journal = j-SIGPLAN, volume = "50", number = "10", pages = "488--504", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814322", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Performance and energy efficiency in memory have become critically important for a wide range of computing domains. However, it is difficult to control and optimize memory power and performance because these effects depend upon activity across multiple layers of the vertical execution stack. To address this challenge, we construct a novel and collaborative framework that employs object placement, cross-layer communication, and page-level management to effectively distribute application objects in the DRAM hardware to achieve desired power/performance goals. In this work, we describe the design and implementation of our framework, which is the first to integrate automatic object profiling and analysis at the application layer with fine-grained management of memory hardware resources in the operating system. We demonstrate the utility of our framework by employing it to more effectively control memory power consumption. We design a custom memory-intensive workload to show the potential of our approach. Next, we develop sampling and profiling-based analyses and modify the code generator in the HotSpot VM to understand object usage patterns and automatically determine and control the placement of hot and cold objects in a partitioned VM heap. This information is communicated to the operating system, which uses it to map the logical application pages to the appropriate DRAM ranks according to user-defined provisioning goals. 
We evaluate our framework and find that it achieves our test goal of significant DRAM energy savings across a variety of workloads, without any source code modifications or recompilations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Madsen:2015:SAE, author = "Magnus Madsen and Frank Tip and Ondrej Lhot{\'a}k", title = "Static analysis of event-driven {Node.js JavaScript} applications", journal = j-SIGPLAN, volume = "50", number = "10", pages = "505--519", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814272", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many JavaScript programs are written in an event-driven style. In particular, in server-side Node.js applications, operations involving sockets, streams, and files are typically performed in an asynchronous manner, where the execution of listeners is triggered by events. Several types of programming errors are specific to such event-based programs (e.g., unhandled events, and listeners that are registered too late). We present the event-based call graph, a program representation that can be used to detect bugs related to event handling. We have designed and implemented three analyses for constructing event-based call graphs. Our results show that these analyses are capable of detecting problems reported on StackOverflow. Moreover, we show that the number of false positives reported by the analysis on a suite of small Node.js applications is manageable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Feng:2015:EQD, author = "Yu Feng and Xinyu Wang and Isil Dillig and Calvin Lin", title = "{EXPLORER} : query- and demand-driven exploration of interprocedural control flow properties", journal = j-SIGPLAN, volume = "50", number = "10", pages = "520--534", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814284", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes a general framework and its implementation in a tool called EXPLORER for statically answering a class of interprocedural control flow queries about Java programs. EXPLORER allows users to formulate queries about feasible callstack configurations using regular expressions, and it employs a precise, demand-driven algorithm for answering such queries. Specifically, EXPLORER constructs an automaton A that is iteratively refined until either the language accepted by A is empty (meaning that the query has been refuted) or until no further refinement is possible based on a precise, context-sensitive abstraction of the program. We evaluate EXPLORER by applying it to three different program analysis tasks, namely, (1) analysis of the observer design pattern in Java, (2) identification of a class of performance bugs, and (3) analysis of inter-component communication in Android applications. 
Our evaluation shows that EXPLORER is both efficient and precise.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Dietrich:2015:GSE, author = "Jens Dietrich and Nicholas Hollingum and Bernhard Scholz", title = "Giga-scale exhaustive points-to analysis for {Java} in under a minute", journal = j-SIGPLAN, volume = "50", number = "10", pages = "535--551", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814307", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computing a precise points-to analysis for very large Java programs remains challenging despite the large body of research on points-to analysis. Any approach must solve an underlying dynamic graph reachability problem, for which the best algorithms have near-cubic worst-case runtime complexity, and, hence, previous work does not scale to programs with millions of lines of code. In this work, we present a novel approach for solving the field-sensitive points-to problem for Java with the means of (1) a transitive-closure data-structure, and (2) a pre-computed set of potentially matching load/store pairs to accelerate the fix-point calculation. Experimentation on Java benchmarks validates the superior performance of our approach over the standard context-free language reachability implementations. Our approach computes a points-to index for the OpenJDK with over 1.5 billion tuples in under a minute.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Darais:2015:GTM, author = "David Darais and Matthew Might and David {Van Horn}", title = "{Galois} transformers and modular abstract interpreters: reusable metatheory for program analysis", journal = j-SIGPLAN, volume = "50", number = "10", pages = "552--571", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814308", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The design and implementation of static analyzers has become increasingly systematic. Yet for a given language or analysis feature, it often requires tedious and error prone work to implement an analyzer and prove it sound. In short, static analysis features and their proofs of soundness do not compose well, causing a dearth of reuse in both implementation and metatheory. We solve the problem of systematically constructing static analyzers by introducing Galois transformers: monad transformers that transport Galois connection properties. In concert with a monadic interpreter, we define a library of monad transformers that implement building blocks for classic analysis parameters like context, path, and heap (in)sensitivity. Moreover, these can be composed together independent of the language being analyzed. Significantly, a Galois transformer can be proved sound once and for all, making it a reusable analysis component. 
As new analysis features and abstractions are developed and mixed in, soundness proofs need not be reconstructed, as the composition of a monad transformer stack is sound by virtue of its constituents. Galois transformers provide a viable foundation for reusable and composable metatheory for program analysis. Finally, these Galois transformers shift the level of abstraction in analysis design and implementation to a level where non-specialists have the ability to synthesize sound analyzers over a number of parameters.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Oh:2015:LSA, author = "Hakjoo Oh and Hongseok Yang and Kwangkeun Yi", title = "Learning a strategy for adapting a program analysis via {Bayesian} optimisation", journal = j-SIGPLAN, volume = "50", number = "10", pages = "572--588", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814309", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Building a cost-effective static analyser for real-world programs is still regarded as an art. One key contributor to this grim reputation is the difficulty in balancing the cost and the precision of an analyser. An ideal analyser should be adaptive to a given analysis task, and avoid using techniques that unnecessarily improve precision and increase analysis cost. However, achieving this ideal is highly nontrivial, and it requires a large amount of engineering effort. In this paper we present a new approach for building an adaptive static analyser. In our approach, the analyser includes a sophisticated parameterised strategy that decides, for each part of a given program, whether to apply a precision-improving technique to that part or not. We present a method for learning a good parameter for such a strategy from an existing codebase via Bayesian optimisation. The learnt strategy is then used for new, unseen programs. Using our approach, we developed partially flow- and context-sensitive variants of a realistic C static analyser. The experimental results demonstrate that using Bayesian optimisation is crucial for learning from an existing codebase.
Also, they show that among all program queries that require flow- or context-sensitivity, our partially flow- and context-sensitive analysis answers 75\% of them, while increasing the analysis cost only by 3.3x of the baseline flow- and context-insensitive analysis, rather than 40x or more of the fully sensitive version.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Alves:2015:RPD, author = "P{\'e}ricles Alves and Fabian Gruber and Johannes Doerfert and Alexandros Lamprineas and Tobias Grosser and Fabrice Rastello and Fernando Magno Quint{\~a}o Pereira", title = "Runtime pointer disambiguation", journal = j-SIGPLAN, volume = "50", number = "10", pages = "589--606", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814285", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To optimize code effectively, compilers must deal with memory dependencies. However, the state-of-the-art heuristics available in the literature to track memory dependencies are inherently imprecise and computationally expensive. Consequently, the most advanced code transformations that compilers have today are ineffective when applied on real-world programs. The goal of this paper is to solve this conundrum through dynamic disambiguation of pointers. We provide different ways to determine at runtime when two memory locations can overlap. We then produce two versions of a code region: one that is aliasing-free --- hence, easy to optimize --- and another that is not. Our checks let us safely branch to the optimizable region. We have applied these ideas on Polly-LLVM, a loop optimizer built on top of the LLVM compilation infrastructure. Our experiments indicate that our method is precise, effective and useful: we can disambiguate every pair of pointers in the loop-intensive Polybench benchmark suite. The result of this precision is code quality: the binaries we generate are 10\% faster than those that Polly-LLVM produces without our optimization, at the {\tt -O3} optimization level of LLVM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Toffola:2015:PPY, author = "Luca Della Toffola and Michael Pradel and Thomas R. Gross", title = "Performance problems you can fix: a dynamic analysis of memoization opportunities", journal = j-SIGPLAN, volume = "50", number = "10", pages = "607--622", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814290", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Performance bugs are a prevalent problem and recent research proposes various techniques to identify such bugs. This paper addresses a kind of performance problem that often is easy to address but difficult to identify: redundant computations that may be avoided by reusing already computed results for particular inputs, a technique called memoization.
To help developers find and use memoization opportunities, we present MemoizeIt, a dynamic analysis that identifies methods that repeatedly perform the same computation. The key idea is to compare inputs and outputs of method calls in a scalable yet precise way. To avoid the overhead of comparing objects at all method invocations in detail, MemoizeIt first compares objects without following any references and iteratively increases the depth of exploration while shrinking the set of considered methods. After each iteration, the approach ignores methods that cannot benefit from memoization, allowing it to analyze calls to the remaining methods in more detail. For every memoization opportunity that MemoizeIt detects, it provides hints on how to implement memoization, making it easy for the developer to fix the performance issue. Applying MemoizeIt to eleven real-world Java programs reveals nine profitable memoization opportunities, most of which are missed by traditional CPU time profilers, conservative compiler optimizations, and other existing approaches for finding performance bugs. Adding memoization as proposed by MemoizeIt leads to statistically significant speedups by factors between 1.04x and 12.93x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Lee:2015:RRA, author = "Wen-Chuan Lee and Tao Bao and Yunhui Zheng and Xiangyu Zhang and Keval Vora and Rajiv Gupta", title = "{RAIVE}: runtime assessment of floating-point instability by vectorization", journal = j-SIGPLAN, volume = "50", number = "10", pages = "623--638", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814299", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Floating point representation has limited precision and inputs to floating point programs may also have errors. Consequently, during execution, errors are introduced, propagated, and accumulated, leading to unreliable outputs. We call this the instability problem. We propose RAIVE, a technique that identifies output variations of a floating point execution in the presence of instability. RAIVE transforms every floating point value to a vector of multiple values --- the values added to create the vector are obtained by introducing artificial errors that are upper bounds of actual errors. The propagation of artificial errors models the propagation of actual errors. When values in vectors result in discrete execution differences (e.g., following different paths), the execution is forked to capture the resulting output variations. Our evaluation shows that RAIVE can precisely capture output variations. 
Its overhead (340\%) is 2.43 times lower than the state of the art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Fu:2015:ABE, author = "Zhoulai Fu and Zhaojun Bai and Zhendong Su", title = "Automated backward error analysis for numerical code", journal = j-SIGPLAN, volume = "50", number = "10", pages = "639--654", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814317", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Numerical code uses floating-point arithmetic and necessarily suffers from roundoff and truncation errors. Error analysis is the process to quantify such uncertainty in the solution to a problem. Forward error analysis and backward error analysis are two popular paradigms of error analysis. Forward error analysis is more intuitive and has been explored and automated by the programming languages (PL) community. In contrast, although backward error analysis is more preferred by numerical analysts and the foundation for numerical stability, it is less known and unexplored by the PL community. To fill the gap, this paper presents an automated backward error analysis for numerical code to empower both numerical analysts and application developers. In addition, we use the computed backward error results to also compute the condition number, an important quantity recognized by numerical analysts for measuring how sensitive a function is to changes or errors in the input. Experimental results on Intel X87 FPU functions and widely-used GNU C Library functions demonstrate that our analysis is effective at analyzing the accuracy of floating-point programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Voelter:2015:UCL, author = "Markus Voelter and Arie van Deursen and Bernd Kolb and Stephan Eberle", title = "Using {C} language extensions for developing embedded software: a case study", journal = j-SIGPLAN, volume = "50", number = "10", pages = "655--674", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814276", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We report on an industrial case study on developing the embedded software for a smart meter using the C programming language and domain-specific extensions of C such as components, physical units, state machines, registers and interrupts. We find that the extensions help significantly with managing the complexity of the software. They improve testability mainly by supporting hardware-independent testing, as illustrated by low integration efforts. The extensions also do not incur significant overhead regarding memory consumption and performance. Our case study relies on mbeddr, an extensible version of C.
mbeddr, in turn, builds on the MPS language workbench which supports modular extension of languages and IDEs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Lopes:2015:HSA, author = "Cristina V. Lopes and Joel Ossher", title = "How scale affects structure in {Java} programs", journal = j-SIGPLAN, volume = "50", number = "10", pages = "675--694", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814300", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many internal software metrics and external quality attributes of Java programs correlate strongly with program size. This knowledge has been used pervasively in quantitative studies of software through practices such as normalization on size metrics. This paper reports size-related super- and sublinear effects that have not been known before. Findings obtained on a very large collection of Java programs --- 30,911 projects hosted at Google Code as of Summer 2011 --- unveils how certain characteristics of programs vary disproportionately with program size, sometimes even non-monotonically. Many of the specific parameters of nonlinear relations are reported. This result gives further insights for the differences of ``programming in the small'' vs. ``programming in the large.'' The reported findings carry important consequences for OO software metrics, and software research in general: metrics that have been known to correlate with size can now be properly normalized so that all the information that is left in them is size-independent.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Mastrangelo:2015:UYO, author = "Luis Mastrangelo and Luca Ponzanelli and Andrea Mocci and Michele Lanza and Matthias Hauswirth and Nathaniel Nystrom", title = "Use at your own risk: the {Java} unsafe {API} in the wild", journal = j-SIGPLAN, volume = "50", number = "10", pages = "695--710", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814313", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Java is a safe language. Its runtime environment provides strong safety guarantees that any Java application can rely on. Or so we think. We show that the runtime actually does not provide these guarantees---for a large fraction of today's Java code. Unbeknownst to many application developers, the Java runtime includes a ``backdoor'' that allows expert library and framework developers to circumvent Java's safety guarantees. This backdoor is there by design, and is well known to experts, as it enables them to write high-performance ``systems-level'' code in Java. For much the same reasons that safe languages are preferred over unsafe languages, these powerful---but unsafe---capabilities in Java should be restricted. They should be made safe by changing the language, the runtime system, or the libraries. 
At the very least, their use should be restricted. This paper is a step in that direction. We analyzed 74 GB of compiled Java code, spread over 86,479 Java archives, to determine how Java's unsafe capabilities are used in real-world libraries and applications. We found that 25\% of Java bytecode archives depend on unsafe third-party Java code, and thus Java's safety guarantees cannot be trusted. We identify 14 different usage patterns of Java's unsafe capabilities, and we provide supporting evidence for why real-world code needs these capabilities. Our long-term goal is to provide a foundation for the design of new language features to regain safety in Java.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Achour:2015:ACO, author = "Sara Achour and Martin C. Rinard", title = "Approximate computation with outlier detection in {Topaz}", journal = j-SIGPLAN, volume = "50", number = "10", pages = "711--730", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814314", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Topaz, a new task-based language for computations that execute on approximate computing platforms that may occasionally produce arbitrarily inaccurate results. Topaz maps tasks onto the approximate hardware and integrates the generated results into the main computation. To prevent unacceptably inaccurate task results from corrupting the main computation, Topaz deploys a novel outlier detection mechanism that recognizes and precisely reexecutes outlier tasks. Outlier detection enables Topaz to work effectively with approximate hardware platforms that have complex fault characteristics, including platforms with bit pattern dependent faults (in which the presence of faults may depend on values stored in adjacent memory cells). Our experimental results show that, for our set of benchmark applications, outlier detection enables Topaz to deliver acceptably accurate results (less than 1\% error) on our target approximate hardware platforms. Depending on the application and the hardware platform, the overall energy savings range from 5 to 13 percent. Without outlier detection, only one of the applications produces acceptably accurate results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Wickerson:2015:RSP, author = "John Wickerson and Mark Batty and Bradford M. Beckmann and Alastair F. Donaldson", title = "Remote-scope promotion: clarified, rectified, and verified", journal = j-SIGPLAN, volume = "50", number = "10", pages = "731--747", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814283", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern accelerator programming frameworks, such as OpenCL, organise threads into work-groups. 
Remote-scope promotion (RSP) is a language extension recently proposed by AMD researchers that is designed to enable applications, for the first time, both to optimise for the common case of intra-work-group communication (using memory scopes to provide consistency only within a work-group) and to allow occasional inter-work-group communication (as required, for instance, to support the popular load-balancing idiom of work stealing). We present the first formal, axiomatic memory model of OpenCL extended with RSP. We have extended the Herd memory model simulator with support for OpenCL kernels that exploit RSP, and used it to discover bugs in several litmus tests and a work-stealing queue, that have been used previously in the study of RSP. We have also formalised the proposed GPU implementation of RSP. The formalisation process allowed us to identify bugs in the description of RSP that could result in well-synchronised programs experiencing memory inconsistencies. We present and prove sound a new implementation of RSP that incorporates bug fixes and requires less non-standard hardware than the original implementation. This work, a collaboration between academia and industry, clearly demonstrates how, when designing hardware support for a new concurrent language feature, the early application of formal tools and techniques can help to prevent errors, such as those we have found, from making it into silicon.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Hammer:2015:ICN, author = "Matthew A. Hammer and Joshua Dunfield and Kyle Headley and Nicholas Labich and Jeffrey S. Foster and Michael Hicks and David {Van Horn}", title = "Incremental computation with names", journal = j-SIGPLAN, volume = "50", number = "10", pages = "748--766", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814305", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the past thirty years, there has been significant progress in developing general-purpose, language-based approaches to incremental computation, which aims to efficiently update the result of a computation when an input is changed. A key design challenge in such approaches is how to provide efficient incremental support for a broad range of programs. In this paper, we argue that first-class names are a critical linguistic feature for efficient incremental computation. Names identify computations to be reused across differing runs of a program, and making them first class gives programmers a high level of control over reuse. We demonstrate the benefits of names by presenting Nominal Adapton, an ML-like language for incremental computation with names. We describe how to use Nominal Adapton to efficiently incrementalize several standard programming patterns---including maps, folds, and unfolds---and show how to build efficient, incremental probabilistic trees and tries. Since Nominal Adapton's implementation is subtle, we formalize it as a core calculus and prove it is from-scratch consistent, meaning it always produces the same answer as simply re-running the computation. 
Finally, we demonstrate that Nominal Adapton can provide large speedups over both from-scratch computation and Adapton, a previous state-of-the-art incremental computation system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Felgentreff:2015:CBC, author = "Tim Felgentreff and Todd Millstein and Alan Borning and Robert Hirschfeld", title = "Checks and balances: constraint solving without surprises in object-constraint programming languages", journal = j-SIGPLAN, volume = "50", number = "10", pages = "767--782", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814311", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Object-constraint programming systems integrate declarative constraint solving with imperative, object-oriented languages, seamlessly providing the power of both paradigms. However, experience with object-constraint systems has shown that giving too much power to the constraint solver opens up the potential for solutions that are surprising and unintended as well as for complex interactions between constraints and imperative code. On the other hand, systems that overly limit the power of the solver, for example by disallowing constraints involving mutable objects, object identity, or polymorphic message sends, run the risk of excluding the core object-oriented features of the language from the constraint part, and consequently not being able to express declaratively a large set of interesting problem solutions. In this paper we present design principles that tame the power of the constraint solver in object-constraint languages to avoid difficult corner cases and surprising solutions while retaining the key features of the approach, including constraints over mutable objects, constraints involving object identity, and constraints on the results of message sends. We present our solution concretely in the context of the Babelsberg object-constraint language framework, providing both an informal description of the resulting language and a formal semantics for a core subset of it. We validate the utility of this semantics with an executable version that allows us to run test programs and to verify that they provide the same results as existing implementations of Babelsberg in JavaScript, Ruby, and Smalltalk.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Steindorfer:2015:OHA, author = "Michael J. Steindorfer and Jurgen J. Vinju", title = "Optimizing hash-array mapped tries for fast and lean immutable {JVM} collections", journal = j-SIGPLAN, volume = "50", number = "10", pages = "783--800", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814312", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The data structures under-pinning collection API (e.g. 
lists, sets, maps) in the standard libraries of programming languages are used intensively in many applications. The standard libraries of recent Java Virtual Machine languages, such as Clojure or Scala, contain scalable and well-performing immutable collection data structures that are implemented as Hash-Array Mapped Tries (HAMTs). HAMTs already feature efficient lookup, insert, and delete operations, however due to their tree-based nature their memory footprints and the runtime performance of iteration and equality checking lag behind array-based counterparts. This particularly prohibits their application in programs which process larger data sets. In this paper, we propose changes to the HAMT design that increase the overall performance of immutable sets and maps. The resulting general purpose design increases cache locality and features a canonical representation. It outperforms Scala's and Clojure's data structure implementations in terms of memory footprint and runtime efficiency of iteration (1.3-6.7x) and equality checking (3-25.4x).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Ureche:2015:AAH, author = "Vlad Ureche and Aggelos Biboudis and Yannis Smaragdakis and Martin Odersky", title = "Automating ad hoc data representation transformations", journal = j-SIGPLAN, volume = "50", number = "10", pages = "801--820", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814271", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To maximize run-time performance, programmers often specialize their code by hand, replacing library collections and containers by custom objects in which data is restructured for efficient access. However, changing the data representation is a tedious and error-prone process that makes it hard to test, maintain and evolve the source code. We present an automated and composable mechanism that allows programmers to safely change the data representation in delimited scopes containing anything from expressions to entire class definitions. To achieve this, programmers define a transformation and our mechanism automatically and transparently applies it during compilation, eliminating the need to manually change the source code. Our technique leverages the type system in order to offer correctness guarantees on the transformation and its interaction with object-oriented language features, such as dynamic dispatch, inheritance and generics. We have embedded this technique in a Scala compiler plugin and used it in four very different transformations, ranging from improving the data layout and encoding, to retrofitting specialization and value class status, and all the way to collection deforestation. On our benchmarks, the technique obtained speedups between 1.8x and 24.5x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Marr:2015:TVP, author = "Stefan Marr and St{\'e}phane Ducasse", title = "Tracing vs. 
partial evaluation: comparing meta-compilation approaches for self-optimizing interpreters", journal = j-SIGPLAN, volume = "50", number = "10", pages = "821--839", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814275", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Tracing and partial evaluation have been proposed as meta-compilation techniques for interpreters to make just-in-time compilation language-independent. They promise that programs executing on simple interpreters can reach performance of the same order of magnitude as if they would be executed on state-of-the-art virtual machines with highly optimizing just-in-time compilers built for a specific language. Tracing and partial evaluation approach this meta-compilation from two ends of a spectrum, resulting in different sets of tradeoffs. This study investigates both approaches in the context of self-optimizing interpreters, a technique for building fast abstract-syntax-tree interpreters. Based on RPython for tracing and Truffle for partial evaluation, we assess the two approaches by comparing the impact of various optimizations on the performance of an interpreter for SOM, an object-oriented dynamically-typed language. The goal is to determine whether either approach yields clear performance or engineering benefits. We find that tracing and partial evaluation both reach roughly the same level of performance. SOM based on meta-tracing is on average 3x slower than Java, while SOM based on partial evaluation is on average 2.3x slower than Java. With respect to the engineering, tracing has however significant benefits, because it requires language implementers to apply fewer optimizations to reach the same level of performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Upadhyaya:2015:EML, author = "Ganesha Upadhyaya and Hridesh Rajan", title = "Effectively mapping linguistic abstractions for message-passing concurrency to threads on the {Java Virtual Machine}", journal = j-SIGPLAN, volume = "50", number = "10", pages = "840--859", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814289", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Efficient mapping of message passing concurrency (MPC) abstractions to Java Virtual Machine (JVM) threads is critical for performance, scalability, and CPU utilization; but tedious and time consuming to perform manually. In general, this mapping cannot be found in polynomial time, but we show that by exploiting the local characteristics of MPC abstractions and their communication patterns this mapping can be determined effectively. We describe our MPC abstraction to thread mapping technique, its realization in two frameworks (Panini and Akka), and its rigorous evaluation using several benchmarks from representative MPC frameworks. 
We also compare our technique against four default mapping techniques: thread-all, round-robin-task-all, random-task-all and work-stealing. Our evaluation shows that our mapping technique can improve the performance by 30\%-60\% over default mapping techniques. These improvements are due to a number of challenges addressed by our technique namely: (i) balancing the computations across JVM threads, (ii) reducing the communication overheads, (iii) utilizing information about cache locality, and (iv) mapping MPC abstractions to threads in a way that reduces the contention between JVM threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Srinivasan:2015:PEM, author = "Venkatesh Srinivasan and Thomas Reps", title = "Partial evaluation of machine code", journal = j-SIGPLAN, volume = "50", number = "10", pages = "860--879", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814321", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents an algorithm for off-line partial evaluation of machine code. The algorithm follows the classical two-phase approach of binding-time analysis (BTA) followed by specialization. However, machine-code partial evaluation presents a number of new challenges, and it was necessary to devise new techniques for use in each phase. --- Our BTA algorithm makes use of an instruction-rewriting method that ``decouples'' multiple updates performed by a single instruction. This method counters the cascading imprecision that would otherwise occur with a more naive approach to BTA. --- Our specializer specializes an explicit representation of the semantics of an instruction, and emits residual code via machine-code synthesis. Moreover, to create code that allows the stack and heap to be at different positions at run-time than at specialization-time, the specializer represents specialization-time addresses using symbolic constants, and uses a symbolic state for specialization. Our experiments show that our algorithm can be used to specialize binaries with respect to commonly used inputs to produce faster binaries, as well as to extract an executable component from a bloated binary.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Erdweg:2015:CCF, author = "Sebastian Erdweg and Oliver Bracevac and Edlira Kuci and Matthias Krebs and Mira Mezini", title = "A co-contextual formulation of type rules and its application to incremental type checking", journal = j-SIGPLAN, volume = "50", number = "10", pages = "880--897", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814277", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Type rules associate types to expressions given a typing context. As the type checker traverses the expression tree top-down, it extends the typing context with additional context information that becomes available. 
This way, the typing context coordinates type checking in otherwise independent subexpressions, which inhibits parallelization and incrementalization of type checking. We propose a co-contextual formulation of type rules that only take an expression as input and produce a type and a set of context requirements. Co-contextual type checkers traverse an expression tree bottom-up and merge context requirements of independently checked subexpressions. We describe a method for systematically constructing a co-contextual formulation of type rules from a regular context-based formulation and we show how co-contextual type rules give rise to incremental type checking. Using our method, we derive incremental type checkers for PCF and for extensions that introduce records, parametric polymorphism, and subtyping. Our performance evaluation shows that co-contextual type checking has performance comparable to standard context-based type checking, and incrementalization can improve performance significantly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Brandauer:2015:DDF, author = "Stephan Brandauer and Dave Clarke and Tobias Wrigstad", title = "Disjointness domains for fine-grained aliasing", journal = j-SIGPLAN, volume = "50", number = "10", pages = "898--916", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814280", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Aliasing is crucial for supporting useful implementation patterns, but it makes reasoning about programs difficult. To deal with this problem, numerous type-based aliasing control mechanisms have been proposed, expressing properties such as uniqueness. Uniqueness, however, is black-and-white: either a reference is unique or it can be arbitrarily aliased; and global: excluding aliases throughout the entire system, making code brittle to changing requirements. Disjointness domains, a new approach to alias control, address this problem by enabling more graduations between uniqueness and arbitrary reference sharing. They allow expressing aliasing constraints local to a certain set of variables (either stack variables or fields) for instance that no aliasing occurs between variables within some set of variables but between such sets or the opposite, that aliasing occurs within that set but not between different sets. A hierarchy of disjointness domains controls the flow of references through a program, helping the programmer reason about disjointness and enforce local alias invariants. The resulting system supports fine-grained control of aliasing between both variables and objects, making aliasing explicit to programmers, compilers, and tooling. This paper presents a formal account of disjointness domains along with examples. 
Disjointness domains provide novel means of expressing may-alias kinds of constraints, which may prove useful in compiler optimisation and verification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Crafa:2015:CAT, author = "Silvia Crafa and Luca Padovani", title = "The chemical approach to typestate-oriented programming", journal = j-SIGPLAN, volume = "50", number = "10", pages = "917--934", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814287", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study a novel approach to typestate-oriented programming based on the chemical metaphor: state and operations on objects are molecules of messages and state transformations are chemical reactions. This approach allows us to investigate typestate in an inherently concurrent setting, whereby objects can be accessed and modified concurrently by several processes, each potentially changing only part of their state. We introduce a simple behavioral type theory to express in a uniform way both the private and the public interfaces of objects, to describe and enforce structured object protocols consisting of possibilities, prohibitions, and obligations, and to control object sharing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Toro:2015:CGP, author = "Mat{\'\i}as Toro and {\'E}ric Tanter", title = "Customizable gradual polymorphic effects for {Scala}", journal = j-SIGPLAN, volume = "50", number = "10", pages = "935--953", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814315", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite their obvious advantages in terms of static reasoning, the adoption of effect systems is still rather limited in practice. Recent advances such as generic effect systems, lightweight effect polymorphism, and gradual effect checking, all represent promising steps towards making effect systems suitable for widespread use. However, no existing system combines these approaches: the theory of gradual polymorphic effects has not been developed, and there are no implementations of gradual effect checking. In addition, a limiting factor in the adoption of effect systems is their unsuitability for localized and customized effect disciplines. This paper addresses these issues by presenting the first implementation of gradual effect checking, for Scala, which supports both effect polymorphism and a domain-specific language called Effscript to declaratively define and customize effect disciplines. 
We report on the theory, implementation, and practical application of the system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '15 conference proceedings.", } @Article{Kim:2015:CPM, author = "Sang-Hoon Kim and Sejun Kwon and Jin-Soo Kim and Jinkyu Jeong", title = "Controlling physical memory fragmentation in mobile systems", journal = j-SIGPLAN, volume = "50", number = "11", pages = "1--14", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754179", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Since the adoption of hardware-accelerated features (e.g., hardware codec) improves the performance and quality of mobile devices, it revives the need for contiguous memory allocation. However, physical memory in mobile systems is highly fragmented due to the frequent spawn and exit of processes and the lack of proactive anti-fragmentation scheme. As a result, the memory allocation for large and contiguous I/O buffers suffer from the highly fragmented memory, thereby incurring high CPU usage and power consumption. This paper presents a proactive anti-fragmentation approach that groups pages with the same lifetime, and stores them contiguously in fixed-size contiguous regions. When a process is killed to secure free memory, a set of contiguous regions are freed and subsequent contiguous memory allocations can be easily satisfied without incurring additional overhead. Our prototype implementation on a Nexus 10 tablet with the Android kernel shows that the proposed scheme greatly alleviates fragmentation, thereby reducing the I/O buffer allocation time, associated CPU usage, and energy consumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Hussein:2015:DRM, author = "Ahmed Hussein and Antony L. Hosking and Mathias Payer and Christopher A. Vick", title = "Don't race the memory bus: taming the {GC} leadfoot", journal = j-SIGPLAN, volume = "50", number = "11", pages = "15--27", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754182", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic voltage and frequency scaling (DVFS) is ubiquitous on mobile devices as a mechanism for saving energy. Reducing the clock frequency of a processor allows a corresponding reduction in power consumption, as does turning off idle cores. Garbage collection is a canonical example of the sort of memory-bound workload that best responds to such scaling. Here, we explore the impact of frequency scaling for garbage collection in a real mobile device running Android's Dalvik virtual machine, which uses a concurrent collector. By controlling the frequency of the core on which the concurrent collector thread runs we can reduce power significantly. 
Running established multi-threaded benchmarks shows that total processor energy can be reduced up to 30\%, with end-to-end performance loss of at most 10\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Cohen:2015:DSA, author = "Nachshon Cohen and Erez Petrank", title = "Data structure aware garbage collector", journal = j-SIGPLAN, volume = "50", number = "11", pages = "28--40", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754176", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Garbage collection may benefit greatly from knowledge about program behavior, but most managed languages do not provide means for the programmer to deliver such knowledge. In this work we propose a very simple interface that requires minor programmer effort and achieves substantial performance and scalability improvements. In particular, we focus on the common use of data structures or collections for organizing data on the heap. We let the program notify the collector which classes represent nodes of data structures and also when such nodes are being removed from their data structures. The data-structure aware (DSA) garbage collector uses this information to improve performance, locality, and load balancing. Experience shows that this interface requires a minor modification of the application. Measurements show that for some significant benchmarks this interface can dramatically reduce the time spent on garbage collection and also improve the overall program performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Kuszmaul:2015:SSF, author = "Bradley C. Kuszmaul", title = "{SuperMalloc}: a super fast multithreaded {\tt malloc} for 64-bit machines", journal = j-SIGPLAN, volume = "50", number = "11", pages = "41--55", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754178", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "SuperMalloc is an implementation of malloc(3) originally designed for X86 Hardware Transactional Memory (HTM). It turns out that the same design decisions also make it fast even without HTM. For the malloc-test benchmark, which is one of the most difficult workloads for an allocator, with one thread SuperMalloc is about 2.1 times faster than the best of DLmalloc, JEmalloc, Hoard, and TBBmalloc; with 8 threads and HTM, SuperMalloc is 2.75 times faster; and on 32 threads without HTM SuperMalloc is 3.4 times faster. SuperMalloc generally compares favorably with the other allocators on speed, scalability, speed variance, memory footprint, and code size. SuperMalloc achieves these performance advantages using less than half as much code as the alternatives. SuperMalloc exploits the fact that although physical memory is always precious, virtual address space on a 64-bit machine is relatively cheap. It allocates 2 chunks which contain objects all the same size.
To translate chunk numbers to chunk metadata, SuperMalloc uses a simple array (most of which is uncommitted to physical memory). SuperMalloc takes care to avoid associativity conflicts in the cache: most of the size classes are a prime number of cache lines, and nonaligned huge accesses are randomly aligned within a page. Objects are allocated from the fullest non-full page in the appropriate size class. For each size class, SuperMalloc employs a 10-object per-thread cache, a per-CPU cache that holds about a level-2-cache worth of objects per size class, and a global cache that is organized to allow the movement of many objects between a per-CPU cache and the global cache using $ O(1) $ instructions. SuperMalloc prefetches everything it can before starting a critical section, which makes the critical sections run fast, and for HTM improves the odds that the transaction will commit.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Osterlund:2015:CCU, author = "Erik {\"O}sterlund and Welf L{\"o}we", title = "Concurrent compaction using a field pinning protocol", journal = j-SIGPLAN, volume = "50", number = "11", pages = "56--69", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754177", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compaction of memory in long running systems has always been important. The latency of compaction increases in today's systems with high memory demands and large heaps. To deal with this problem, we present a lock-free protocol allowing for copying concurrent with the application running, which reduces the latencies of compaction radically. It provides theoretical progress guarantees for copying and application threads without making it practically infeasible, with performance overheads of 15\% on average. The algorithm paves the way for a future lock-free Garbage Collector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Lin:2015:SGU, author = "Yi Lin and Kunshan Wang and Stephen M. Blackburn and Antony L. Hosking and Michael Norrish", title = "Stop and go: understanding yieldpoint behavior", journal = j-SIGPLAN, volume = "50", number = "11", pages = "70--80", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754187", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Yieldpoints are critical to the implementation of high performance garbage collected languages, yet the design space is not well understood. Yieldpoints allow a running program to be interrupted at well-defined points in its execution, facilitating exact garbage collection, biased locking, on-stack replacement, profiling, and other important virtual machine behaviors. In this paper we identify and evaluate yieldpoint design choices, including previously undocumented designs and optimizations. 
One of the designs we identify opens new opportunities for very low overhead profiling. We measure the frequency with which yieldpoints are executed and establish a methodology for evaluating the common case execution time overhead. We also measure the median and worst case time-to-yield. We find that Java benchmarks execute about 100M yieldpoints per second, of which about 1/20000 are taken. The average execution time overhead for untaken yieldpoints on the VM we use ranges from 2.5\% to close to zero on modern hardware, depending on the design, and we find that the designs trade off total overhead with worst case time-to-yield. This analysis gives new insight into a critical but overlooked aspect of garbage collector implementation, and identifies a new optimization and new opportunities for very low overhead profiling.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Stancu:2015:SEH, author = "Codrut Stancu and Christian Wimmer and Stefan Brunthaler and Per Larsen and Michael Franz", title = "Safe and efficient hybrid memory management for {Java}", journal = j-SIGPLAN, volume = "50", number = "11", pages = "81--92", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754185", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Java uses automatic memory management, usually implemented as a garbage-collected heap. That lifts the burden of manually allocating and deallocating memory, but it can incur significant runtime overhead and increase the memory footprint of applications. We propose a hybrid memory management scheme that utilizes region-based memory management to deallocate objects automatically on region exits. Static program analysis detects allocation sites that are safe for region allocation, i.e., the static analysis proves that the objects allocated at such a site are not reachable after the region exit. A regular garbage-collected heap is used for objects that are not region allocatable. The region allocation exploits the temporal locality of object allocation. Our analysis uses coarse-grain source code annotations to disambiguate objects with non-overlapping lifetimes, and maps them to different memory scopes. Region-allocated memory does not require garbage collection as the regions are simply deallocated when they go out of scope. The region allocation technique is backed by a garbage collector that manages memory that is not region allocated. 
We provide a detailed description of the analysis, provide experimental results showing that as much as 78\% of the memory is region allocatable and discuss how our hybrid memory management system can be implemented efficiently with respect to both space and time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Miranda:2015:PRB, author = "Eliot Miranda and Cl{\'e}ment B{\'e}ra", title = "A partial read barrier for efficient support of live object-oriented programming", journal = j-SIGPLAN, volume = "50", number = "11", pages = "93--104", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754186", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Live programming, originally introduced by Smalltalk and Lisp, and now gaining popularity in contemporary systems such as Swift, requires on-the-fly support for object schema migration, such that the layout of objects may be changed while the program is at one and the same time being run and developed. In Smalltalk schema migration is supported by two primitives, one that answers a collection of all instances of a class, and one that exchanges the identities of pairs of objects, called the become primitive. Existing instances are collected, copies using the new schema created, state copied from old to new, and the two exchanged with become, effecting the schema migration. Historically the implementation of become has either required an extra level of indirection between an object's address and its body, slowing down slot access, or has required a sweep of all objects, a very slow operation on large heaps. Spur, a new object representation and memory manager for Smalltalk-like languages, has neither of these deficiencies. It uses direct pointers but still provides a fast become operation in large heaps, thanks to forwarding objects that when read conceptually answer another object and a partial read barrier that avoids the cost of explicitly checking for forwarding objects on the vast majority of object accesses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Clifford:2015:MMD, author = "Daniel Clifford and Hannes Payer and Michael Stanton and Ben L. Titzer", title = "Memento mori: dynamic allocation-site-based optimizations", journal = j-SIGPLAN, volume = "50", number = "11", pages = "105--117", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754181", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Languages that lack static typing are ubiquitous in the world of mobile and web applications. The rapid rise of larger applications like interactive web GUIs, games, and cryptography presents a new range of implementation challenges for modern virtual machines to close the performance gap between typed and untyped languages. 
While all languages can benefit from efficient automatic memory management, languages like JavaScript present extra thrill with innocent-looking but difficult features like dynamically-sized arrays, deletable properties, and prototypes. Optimizing such languages requires complex dynamic techniques with more radical object layout strategies such as dynamically evolving representations for arrays. This paper presents a general approach for gathering temporal allocation site feedback that tackles both the general problem of object lifetime estimation and improves optimization of these problematic language features. We introduce a new implementation technique where allocation mementos processed by the garbage collector and runtime system efficiently tie objects back to allocation sites in the program and dynamically estimate object lifetime, representation, and size to inform three optimizations: pretenuring, pretransitioning, and presizing. Unlike previous work on pretenuring, our system utilizes allocation mementos to achieve fully dynamic allocation-site-based pretenuring in a production system. We implement all of our techniques in V8, a high performance virtual machine for JavaScript, and demonstrate solid performance improvements across a range of benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Shidal:2015:RTC, author = "Jonathan Shidal and Ari J. Spilo and Paul T. Scheid and Ron K. Cytron and Krishna M. Kavi", title = "Recycling trash in cache", journal = j-SIGPLAN, volume = "50", number = "11", pages = "118--130", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754183", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The disparity between processing and storage speeds can be bridged in part by reducing the traffic into and out of the slower memory components. Some recent studies reduce such traffic by determining dead data in cache, showing that a significant fraction of writes can be squashed before they make the trip toward slower memory. In this paper, we examine a technique for eliminating traffic in the other direction, specifically the traffic induced by dynamic storage allocation. We consider recycling dead storage in cache to satisfy a program's storage-allocation requests. We first evaluate the potential for recycling under favorable circumstances, where the associated logic can run at full speed with no impact on the cache's normal behavior. We then consider a more practical implementation, in which the associated logic executes independently from the cache's critical path. Here, the cache's performance is unfettered by recycling, but the operations necessary to determine dead storage and recycle such storage execute as time is available. 
Finally, we present the design and analysis of a hardware implementation that scales well with cache size without sacrificing too much performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Cutler:2015:RPT, author = "Cody Cutler and Robert Morris", title = "Reducing pause times with clustered collection", journal = j-SIGPLAN, volume = "50", number = "11", pages = "131--142", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754184", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Each full garbage collection in a program with millions of objects can pause the program for multiple seconds. Much of this work is typically repeated, as the collector re-traces parts of the object graph that have not changed since the last collection. Clustered Collection reduces full collection pause times by eliminating much of this repeated work. Clustered Collection identifies clusters: regions of the object graph that are reachable from a single ``head'' object, so that reachability of the head implies reachability of the whole cluster. As long as it is not written, a cluster need not be re-traced by successive full collections. The main design challenge is coping with program writes to clusters while ensuring safe, complete, and fast collections. In some cases program writes require clusters to be dissolved, but in most cases Clustered Collection can handle writes without having to re-trace the affected cluster. Clustered Collection chooses clusters likely to suffer few writes and to yield high savings from re-trace avoidance. Clustered Collection is implemented as modifications to the Racket collector. Measurements of the code and data from the Hacker News web site (which suffers from significant garbage collection pauses) and a Twitter-like application show that Clustered Collection decreases full collection pause times by a factor of three and six respectively. This improvement is possible because both applications have gigabytes of live data, modify only a small fraction of it, and usually write in ways that do not result in cluster dissolution. Identifying clusters takes more time than a full collection, but happens much less frequently than full collection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Cameron:2015:JFE, author = "Callum Cameron and Jeremy Singer and David Vengerov", title = "The judgment of {FORSETI}: economic utility for dynamic heap sizing of multiple runtimes", journal = j-SIGPLAN, volume = "50", number = "11", pages = "143--156", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754180", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "We introduce the FORSETI system, which is a principled approach for holistic memory management. 
It permits a sysadmin to specify the total physical memory resource that may be shared between all concurrent virtual machines on a physical node. FORSETI models the heap size versus application throughput for each virtual machine, and seeks to maximize the combined throughput of the set of VMs based on concepts from economic utility theory. We evaluate the FORSETI system using a standard Java managed runtime, i.e. OpenJDK. Our results demonstrate that FORSETI enables dramatic reductions (up to 5x) in heap footprint without compromising application execution times.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '15 conference proceedings.", } @Article{Diatchki:2015:IHT, author = "Iavor S. Diatchki", title = "Improving {Haskell} types with {SMT}", journal = j-SIGPLAN, volume = "50", number = "12", pages = "1--10", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804307", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a technique for integrating GHC's type-checker with an SMT solver. The technique was developed to add support for reasoning about type-level functions on natural numbers, and so our implementation uses the theory of linear arithmetic. However, the approach is not limited to this theory, and makes it possible to experiment with other external decision procedures, such as reasoning about type-level booleans, bit-vectors, or any other theory supported by SMT solvers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Gundry:2015:TPU, author = "Adam Gundry", title = "A typechecker plugin for units of measure: domain-specific constraint solving in {GHC Haskell}", journal = j-SIGPLAN, volume = "50", number = "12", pages = "11--22", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804305", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Typed functional programming and units of measure are a natural combination, as F\# ably demonstrates. However, encoding statically-checked units in Haskell's type system leads to inevitable disappointment with the usability of the resulting system. Extending the language itself would produce a much better result, but it would be a lot of work! In this paper, I demonstrate how typechecker plugins in the Glasgow Haskell Compiler allow users to define domain-specific constraint solving behaviour, making it possible to implement units of measure as a type system extension without rebuilding the compiler. 
This paves the way for a more modular treatment of constraint solving in GHC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Farmer:2015:RHT, author = "Andrew Farmer and Neil Sculthorpe and Andy Gill", title = "Reasoning with the {HERMIT}: tool support for equational reasoning on {GHC} core programs", journal = j-SIGPLAN, volume = "50", number = "12", pages = "23--34", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804303", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A benefit of pure functional programming is that it encourages equational reasoning. However, the Haskell language has lacked direct tool support for such reasoning. Consequently, reasoning about Haskell programs is either performed manually, or in another language that does provide tool support (e.g. Agda or Coq). HERMIT is a Haskell-specific toolkit designed to support equational reasoning and user-guided program transformation, and to do so as part of the GHC compilation pipeline. This paper describes HERMIT's recently developed support for equational reasoning, and presents two case studies of HERMIT usage: checking that type-class laws hold for specific instance declarations, and mechanising textbook equational reasoning.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Breitner:2015:FPC, author = "Joachim Breitner", title = "Formally proving a compiler transformation safe", journal = j-SIGPLAN, volume = "50", number = "12", pages = "35--46", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804312", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We prove that the Call Arity analysis and transformation, as implemented in the Haskell compiler GHC, is safe, i.e. does not impede the performance of the program. We formalized syntax, semantics, the analysis and the transformation in the interactive theorem prover Isabelle to obtain a machine-checked proof and hence a level of rigor rarely obtained for compiler optimization safety theorems. The proof is modular and introduces trace trees as a suitable abstraction in abstract cardinality analyses. We discuss the breadth of the formalization gap.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Perez:2015:BGG, author = "Ivan Perez and Henrik Nilsson", title = "Bridging the {GUI} gap with reactive values and relations", journal = j-SIGPLAN, volume = "50", number = "12", pages = "47--58", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804316", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There are at present two ways to write GUIs for functional code. 
One is to use standard GUI toolkits, with all the benefits they bring in terms of feature completeness, choice of platform, conformance to platform-specific look-and-feel, long-term viability, etc. However, such GUI APIs mandate an imperative programming style for the GUI and related parts of the application. Alternatively, we can use a functional GUI toolkit. The GUI can then be written in a functional style, but at the cost of foregoing many advantages of standard toolkits that often will be of critical importance. This paper introduces a light-weight framework structured around the notions of reactive values and reactive relations. It allows standard toolkits to be used from functional code written in a functional style. We thus bridge the gap between the two worlds, bringing the advantages of both to the developer. Our framework is available on Hackage and has been validated through the development of non-trivial applications in a commercial context, and with different standard GUI toolkits.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Gill:2015:RMD, author = "Andy Gill and Neil Sculthorpe and Justin Dawson and Aleksander Eskilson and Andrew Farmer and Mark Grebe and Jeffrey Rosenbluth and Ryan Scott and James Stanton", title = "The remote monad design pattern", journal = j-SIGPLAN, volume = "50", number = "12", pages = "59--70", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804311", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Remote Procedure Calls are expensive. This paper demonstrates how to reduce the cost of calling remote procedures from Haskell by using the remote monad design pattern, which amortizes the cost of remote calls. This gives the Haskell community access to remote capabilities that are not directly supported, at a surprisingly inexpensive cost. We explore the remote monad design pattern through six models of remote execution patterns, using a simulated Internet of Things toaster as a running example. We consider the expressiveness and optimizations enabled by each remote execution model, and assess the feasibility of our approach. We then present a full-scale case study: a Haskell library that provides a Foreign Function Interface to the JavaScript Canvas API. Finally, we discuss existing instances of the remote monad design pattern found in Haskell libraries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Morris:2015:VV, author = "J.
Garrett Morris", title = "Variations on variants", journal = j-SIGPLAN, volume = "50", number = "12", pages = "71--81", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804320", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Extensible variants improve the modularity and expressiveness of programming languages: they allow program functionality to be decomposed into independent blocks, and allow seamless extension of existing code with both new cases of existing data types and new operations over those data types. This paper considers three approaches to providing extensible variants in Haskell. Row typing is a long understood mechanism for typing extensible records and variants, but its adoption would require extension of Haskell's core type system. Alternatively, we might hope to encode extensible variants in terms of existing mechanisms, such as type classes. We describe an encoding of extensible variants using instance chains, a proposed extension of the class system. Unlike many previous encodings of extensible variants, ours does not require the definition of a new type class for each function that consumes variants. Finally, we translate our encoding to use closed type families, an existing feature of GHC. Doing so demonstrates the interpretation of instances chains and functional dependencies in closed type families. One concern with encodings like ours is how completely they match the encoded system. We compare the expressiveness of our encodings with each other and with systems based on row types. We find that, while equivalent terms are typable in each system, both encodings require explicit type annotations to resolve ambiguities in typing not present in row type systems, and the type family implementation retains more constraints in principal types than does the instance chain implementation. We propose a general mechanism to guide the instantiation of ambiguous type variables, show that it eliminates the need for type annotations in our encodings, and discuss conditions under which it preserves coherence.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Oliveira:2015:MRM, author = "Bruno C. d. S. Oliveira and Shin-Cheng Mu and Shu-Hung You", title = "Modular reifiable matching: a list-of-functors approach to two-level types", journal = j-SIGPLAN, volume = "50", number = "12", pages = "82--93", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804315", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "This paper presents Modular Reifiable Matching (MRM): a new approach to two level types using a fixpoint of list-of-functors representation. MRM allows the modular definition of datatypes and functions by pattern matching, using a style similar to the widely popular Datatypes a la Carte (DTC) approach. However, unlike DTC, MRM uses a fixpoint of list-of-functors approach to two-level types. This approach has advantages that help with various aspects of extensibility, modularity and reuse. 
Firstly, modular pattern matching definitions are collected using a list of matches that is fully reifiable. This allows for extensible pattern matching definitions to be easily reused/inherited, and particular matches to be overridden. Such flexibility is used, among other things, to implement extensible generic traversals. Secondly, the subtyping relation between lists of functors is quite simple, does not require backtracking, and is easy to model in languages like Haskell. MRM is implemented as a Haskell library, and its use and applicability are illustrated through various examples in the paper.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Kiselyov:2015:FMM, author = "Oleg Kiselyov and Hiromi Ishii", title = "Freer monads, more extensible effects", journal = j-SIGPLAN, volume = "50", number = "12", pages = "94--105", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804319", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a rational reconstruction of extensible effects, the recently proposed alternative to monad transformers, as the confluence of efforts to make effectful computations compose. Free monads and then extensible effects emerge from the straightforward term representation of an effectful computation, as more and more boilerplate is abstracted away. The generalization process further leads to freer monads, constructed without the Functor constraint. The continuation exposed in freer monads can then be represented as an efficient type-aligned data structure. The end result is the algorithmically efficient extensible effects library, which is not only more comprehensible but also faster than earlier implementations. As an illustration of the new library, we show three surprisingly simple applications: non-determinism with committed choice (LogicT), catching IO exceptions in the presence of other effects, and the semi-automatic management of file handles and other resources through monadic regions. We extensively use and promote the new sort of `laziness', which underlies the left Kan extension: instead of performing an operation, keep its operands and pretend it is done.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Foner:2015:FPG, author = "Kenneth Foner", title = "Functional pearl: getting a quick fix on comonads", journal = j-SIGPLAN, volume = "50", number = "12", pages = "106--117", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804310", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A piece of functional programming folklore due to Piponi provides L{\"o}b's theorem from modal provability logic with a computational interpretation as an unusual fixed point. Interpreting modal necessity as an arbitrary Functor in Haskell, the ``type'' of L{\"o}b's theorem is inhabited by a fixed point function allowing each part of a structure to refer to the whole. 
However, Functor's logical interpretation may be used to prove L{\"o}b's theorem only by relying on its implicit functorial strength, an axiom not available in the provability modality. As a result, the well known Loeb fixed point ``cheats'' by using functorial strength to implement its recursion. Rather than Functor, a closer Curry analogue to modal logic's Howard inspiration is a closed (semi-)comonad, of which Haskell's ComonadApply typeclass provides analogous structure. Its computational interpretation permits the definition of a novel fixed point function allowing each part of a structure to refer to its own context within the whole. This construction further guarantees maximal sharing and asymptotic efficiency superior to Loeb for locally contextual computations upon a large class of structures. With the addition of a distributive law, closed comonads may be composed into spaces of arbitrary dimensionality while preserving the performance guarantees of this new fixed point. From these elements, we construct a small embedded domain-specific language to elegantly express and evaluate multidimensional ``spreadsheet-like'' recurrences for a variety of cellular automata.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Stolarek:2015:ITF, author = "Jan Stolarek and Simon Peyton Jones and Richard A. Eisenberg", title = "Injective type families for {Haskell}", journal = j-SIGPLAN, volume = "50", number = "12", pages = "118--128", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804314", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Haskell, as implemented by the Glasgow Haskell Compiler (GHC), allows expressive type-level programming. The most popular type-level programming extension is TypeFamilies, which allows users to write functions on types. Yet, using type functions can cripple type inference in certain situations. In particular, lack of injectivity in type functions means that GHC can never infer an instantiation of a type variable appearing only under type functions. In this paper, we describe a small modification to GHC that allows type functions to be annotated as injective. GHC naturally must check validity of the injectivity annotations. The algorithm to do so is surprisingly subtle. We prove soundness for a simplification of our algorithm, and state and prove a completeness property, though the algorithm is not fully complete. As much of our reasoning surrounds functions defined by a simple pattern-matching structure, we believe our results extend beyond just Haskell. 
We have implemented our solution on a branch of GHC and plan to make it available to regular users with the next stable release of the compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Serrano:2015:TFC, author = "Alejandro Serrano and Jurriaan Hage and Patrick Bahr", title = "Type families with class, type classes with family", journal = j-SIGPLAN, volume = "50", number = "12", pages = "129--140", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804304", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Type classes and type families are key ingredients in Haskell programming. Type classes were introduced to deal with ad-hoc polymorphism, although with the introduction of functional dependencies, their use expanded to type-level programming. Type families also allow encoding type-level functions, but more directly in the form of rewrite rules. In this paper we show that type families are powerful enough to simulate type classes (without overlapping instances), and we provide a formal proof of the soundness and completeness of this simulation. Encoding instance constraints as type families eases the path to proposed extensions to type classes, like closed sets of instances, instance chains, and control over the search procedure. The only feature which type families cannot simulate is elaboration, that is, generating code from the derivation of a rewriting. We look at ways to solve this problem in current Haskell, and propose an extension to allow elaboration during the rewriting phase.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Walker:2015:DFC, author = "Michael Walker and Colin Runciman", title = "{D{\'e}j{\`a} Fu}: a concurrency testing library for {Haskell}", journal = j-SIGPLAN, volume = "50", number = "12", pages = "141--152", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804306", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Systematic concurrency testing (SCT) is an approach to testing potentially nondeterministic concurrent programs. SCT avoids potentially unrepeatable results that may arise from unit testing concurrent programs. It seems to have received little attention from Haskell programmers. This paper introduces a generalisation of Haskell's concurrency abstraction in the form of typeclasses, and a library for testing concurrent programs. 
A number of examples are provided, some of which come from pre-existing packages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Trilla:2015:IIP, author = "Jos{\'e} Manuel Calder{\'o}n Trilla and Colin Runciman", title = "Improving implicit parallelism", journal = j-SIGPLAN, volume = "50", number = "12", pages = "153--164", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804308", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Using static analysis techniques compilers for lazy functional languages can be used to identify parts of a program that can be legitimately evaluated in parallel and ensure that those expressions are executed concurrently with the main thread of execution. These techniques can produce improvements in the runtime performance of a program, but are limited by the static analyses' poor prediction of runtime performance. This paper outlines the development of a system that uses iterative profile-directed improvement in addition to well-studied static analysis techniques. This allows us to achieve higher performance gains than through static analysis alone.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Scibior:2015:PPP, author = "Adam {\'S}cibior and Zoubin Ghahramani and Andrew D. Gordon", title = "Practical probabilistic programming with monads", journal = j-SIGPLAN, volume = "50", number = "12", pages = "165--176", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804317", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The machine learning community has recently shown a lot of interest in practical probabilistic programming systems that target the problem of Bayesian inference. Such systems come in different forms, but they all express probabilistic models as computational processes using syntax resembling programming languages. In the functional programming community monads are known to offer a convenient and elegant abstraction for programming with probability distributions, but their use is often limited to very simple inference problems. We show that it is possible to use the monad abstraction to construct probabilistic models for machine learning, while still offering good performance of inference in challenging models. We use a GADT as an underlying representation of a probability distribution and apply Sequential Monte Carlo-based methods to achieve efficient inference. We define a formal semantics via measure theory. 
We demonstrate a clean and elegant implementation that achieves performance comparable with Anglican, a state-of-the-art probabilistic programming system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Polakow:2015:EFL, author = "Jeff Polakow", title = "Embedding a full linear lambda calculus in {Haskell}", journal = j-SIGPLAN, volume = "50", number = "12", pages = "177--188", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804309", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an encoding of full linear lambda calculus in Haskell using higher order abstract syntax. By making use of promoted data kinds, multi-parameter type classes and functional dependencies, the encoding allows Haskell to do both linear type checking and linear type inference.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{Elliott:2015:GFI, author = "Trevor Elliott and Lee Pike and Simon Winwood and Pat Hickey and James Bielman and Jamey Sharp and Eric Seidel and John Launchbury", title = "Guilt free {Ivory}", journal = j-SIGPLAN, volume = "50", number = "12", pages = "189--200", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804318", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Ivory is a language that enforces memory safety and avoids most undefined behaviors while providing low-level control of memory-manipulation. Ivory is embedded in a modern variant of Haskell, as implemented by the GHC compiler. The main contributions of the paper are two-fold. First, we demonstrate how to embed the type-system of a safe-C language into the type extensions of GHC. Second, Ivory is of interest in its own right, as a powerful language for writing high-assurance embedded programs. Beyond invariants enforced by its type-system, Ivory has direct support for model-checking, theorem-proving, and property-based testing. Ivory's semantics have been formalized and proved to guarantee memory safety.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{McDonell:2015:TSR, author = "Trevor L. McDonell and Manuel M. T. Chakravarty and Vinod Grover and Ryan R. Newton", title = "Type-safe runtime code generation: accelerate to {LLVM}", journal = j-SIGPLAN, volume = "50", number = "12", pages = "201--212", month = dec, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887747.2804313", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Embedded languages are often compiled at application runtime; thus, embedded compile-time errors become application runtime errors. 
We argue that advanced type system features, such as GADTs and type families, play a crucial role in minimising such runtime errors. Specifically, a rigorous type discipline reduces runtime errors due to bugs in both embedded language applications and the implementation of the embedded language compiler itself. In this paper, we focus on the safety guarantees achieved by type preserving compilation. We discuss the compilation pipeline of Accelerate, a high-performance array language targeting both multicore CPUs and GPUs, where we are able to preserve types from the source language down to a low-level register language in SSA form. Specifically, we demonstrate the practicability of our approach by creating a new type-safe interface to the industrial-strength LLVM compiler infrastructure, which we used to build two new Accelerate backends that show competitive runtimes on a set of benchmarks across both CPUs and GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '15 conference proceedings.", } @Article{McKinley:2016:PWU, author = "Kathryn S. McKinley", title = "Programming the world of uncertain things (keynote)", journal = j-SIGPLAN, volume = "51", number = "1", pages = "1--2", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2843895", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computing has entered the era of uncertain data, in which hardware and software generate and reason about estimates. Applications use estimates from sensors, machine learning, big data, humans, and approximate hardware and software. Unfortunately, developers face pervasive correctness, programmability, and optimization problems due to estimates. Most programming languages unfortunately make these problems worse. We propose a new programming abstraction called {Uncertain$<$T$>$} embedded into languages, such as C\#, C++, Java, Python, and JavaScript. Applications that consume estimates use familiar discrete operations for their estimates; overloaded conditional operators specify hypothesis tests and applications use them to control false positives and negatives; and new compositional operators express domain knowledge. By carefully restricting the expressiveness, the runtime automatically implements correct statistical reasoning at conditionals, relieving developers of the need to implement or deeply understand statistics. We demonstrate substantial programmability, correctness, and efficiency benefits of this programming model for GPS sensor navigation, approximate computing, machine learning, and xBox.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Murray:2016:SRC, author = "Richard M.
Murray", title = "Synthesis of reactive controllers for hybrid systems (keynote)", journal = j-SIGPLAN, volume = "51", number = "1", pages = "3--3", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2843894", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Decision-making logic in hybrid systems is responsible for selecting modes of operation for the underlying (continuous) control system, reacting to external events and failures in the system, and insuring that the overall control system is satisfying safety and performance specifications. Tools from computer science, such as model-checking and logic synthesis, combined with design patterns from feedback control theory provide new approaches to solving these problems. A major shift is the move from ``design then verify'' to ``specify then synthesize'' approaches to controller design that allow simultaneous synthesis of high-performance, robust control laws and correct-by-construction decision-making logic.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Walker:2016:CPL, author = "David Walker", title = "Confluences in programming languages research (keynote)", journal = j-SIGPLAN, volume = "51", number = "1", pages = "4--4", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2843896", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A confluence occurs when two rivers flow together; downstream the combined forces gather strength and propel their waters forward with increased vigor. In academic research, according to Varghese, a confluence occurs after some trigger, perhaps a discovery or a change in technology, and brings two previously separate branches of research together. In this talk, I will discuss confluences in programming languages research. Here, confluences often occur when basic research finds application in some important new domain. Two prime examples from my own career involve the confluence of research in type theory and systems security, triggered by new theoretical tools for reasoning about programming language safety, and the confluence of formal methods and networking, triggered by the rise of data centers. 
These experiences may shed light on what to teach our students and what is next for programming languages research.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Brown:2016:BTN, author = "Matt Brown and Jens Palsberg", title = "Breaking through the normalization barrier: a self-interpreter for {F$_{\rm omega}$}", journal = j-SIGPLAN, volume = "51", number = "1", pages = "5--17", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837623", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "According to conventional wisdom, a self-interpreter for a strongly normalizing lambda-calculus is impossible. We call this the normalization barrier. The normalization barrier stems from a theorem in computability theory that says that a total universal function for the total computable functions is impossible. In this paper we break through the normalization barrier and define a self-interpreter for System F$_{\rm omega}$, a strongly normalizing lambda-calculus. After a careful analysis of the classical theorem, we show that static type checking in F$_{\rm omega}$ can exclude the proof's diagonalization gadget, leaving open the possibility for a self-interpreter. Along with the self-interpreter, we program four other operations in F$_{\rm omega}$, including a continuation-passing style transformation. Our operations rely on a new approach to program representation that may be useful in theorem provers and compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Altenkirch:2016:TTT, author = "Thorsten Altenkirch and Ambrus Kaposi", title = "Type theory in type theory using quotient inductive types", journal = j-SIGPLAN, volume = "51", number = "1", pages = "18--29", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837638", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an internal formalisation of a type theory with dependent types in Type Theory using a special case of higher inductive types from Homotopy Type Theory which we call quotient inductive types (QITs). Our formalisation of type theory avoids referring to preterms or a typability relation but defines directly well typed objects by an inductive definition. We use the elimination principle to define the set-theoretic and logical predicate interpretation. The work has been formalized using the Agda system extended with QITs using postulates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Cai:2016:SFE, author = "Yufei Cai and Paolo G.
Giarrusso and Klaus Ostermann", title = "System {F$_{\rm omega}$} with equirecursive types for datatype-generic programming", journal = j-SIGPLAN, volume = "51", number = "1", pages = "30--43", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837660", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Traversing an algebraic datatype by hand requires boilerplate code which duplicates the structure of the datatype. Datatype-generic programming (DGP) aims to eliminate such boilerplate code by decomposing algebraic datatypes into type constructor applications from which generic traversals can be synthesized. However, different traversals require different decompositions, which yield isomorphic but unequal types. This hinders the interoperability of different DGP techniques. In this paper, we propose F$_{\omega\mu}$, an extension of the higher-order polymorphic lambda calculus F$_{\omega}$ with records, variants, and equirecursive types. We prove the soundness of the type system, and show that type checking for first-order recursive types is decidable with a practical type checking algorithm. In our soundness proof we define type equality by interpreting types as infinitary $\lambda$-terms (in particular, Berarducci-trees). To decide type equality we $\beta$-normalize types, and then use an extension of equivalence checking for usual equirecursive types. Thanks to equirecursive types, new decompositions for a datatype can be added modularly and still interoperate with each other, allowing multiple DGP techniques to work together. We sketch how generic traversals can be synthesized, and apply these components to some examples. Since the set of datatype decomposition becomes extensible, System F$_{\omega\mu}$ enables using DGP techniques incrementally, instead of planning for them upfront or doing invasive refactoring.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Curien:2016:TER, author = "Pierre-Louis Curien and Marcelo Fiore and Guillaume Munch-Maccagnoni", title = "A theory of effects and resources: adjunction models and polarised calculi", journal = j-SIGPLAN, volume = "51", number = "1", pages = "44--56", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837652", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We consider the Curry--Howard-Lambek correspondence for effectful computation and resource management, specifically proposing polarised calculi together with presheaf-enriched adjunction models as the starting point for a comprehensive semantic theory relating logical systems, typed calculi, and categorical models in this context. Our thesis is that the combination of effects and resources should be considered orthogonally. Model theoretically, this leads to an understanding of our categorical models from two complementary perspectives: (i) as a linearisation of CBPV (Call-by-Push-Value) adjunction models, and (ii) as an extension of linear/non-linear adjunction models with an adjoint resolution of computational effects.
When the linear structure is cartesian and the resource structure is trivial we recover Levy's notion of CBPV adjunction model, while when the effect structure is trivial we have Benton's linear/non-linear adjunction models. Further instances of our model theory include the dialogue categories with a resource modality of Melli{\`e}s and Tabareau, and the [E]EC ([Enriched] Effect Calculus) models of Egger, M{\o}gelberg and Simpson. Our development substantiates the approach by providing a lifting theorem of linear models into cartesian ones. To each of our categorical models we systematically associate a typed term calculus, each of which corresponds to a variant of the sequent calculi LJ (Intuitionistic Logic) or ILL (Intuitionistic Linear Logic). The adjoint resolution of effects corresponds to polarisation whereby, syntactically, types locally determine a strict or lazy evaluation order and, semantically, the associativity of cuts is relaxed. In particular, our results show that polarisation provides a computational interpretation of CBPV in direct style. Further, we characterise depolarised models: those where the cut is associative, and where the evaluation order is unimportant. We explain possible advantages of this style of calculi for the operational semantics of effects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Murase:2016:TVH, author = "Akihiro Murase and Tachio Terauchi and Naoki Kobayashi and Ryosuke Sato and Hiroshi Unno", title = "Temporal verification of higher-order functional programs", journal = j-SIGPLAN, volume = "51", number = "1", pages = "57--68", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837667", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an automated approach to verifying arbitrary omega-regular properties of higher-order functional programs. Previous automated methods proposed for this class of programs could only handle safety properties or termination, and our approach is the first to be able to verify arbitrary omega-regular liveness properties. Our approach is automata-theoretic, and extends our recent work on binary-reachability-based approach to automated termination verification of higher-order functional programs to fair termination published in ESOP 2014. In that work, we have shown that checking disjunctive well-foundedness of (the transitive closure of) the ``calling relation'' is sound and complete for termination. The extension to fair termination is tricky, however, because the straightforward extension that checks disjunctive well-foundedness of the fair calling relation turns out to be unsound, as we shall show in the paper. Roughly, our solution is to check fairness on the transition relation instead of the calling relation, and propagate the information to determine when it is necessary and sufficient to check for disjunctive well-foundedness on the calling relation. We prove that our approach is sound and complete. 
We have implemented a prototype of our approach, and confirmed that it is able to automatically verify liveness properties of some non-trivial higher-order programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Plotkin:2016:SNV, author = "Gordon D. Plotkin and Nikolaj Bj{\o}rner and Nuno P. Lopes and Andrey Rybalchenko and George Varghese", title = "Scaling network verification using symmetry and surgery", journal = j-SIGPLAN, volume = "51", number = "1", pages = "69--83", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837657", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "On the surface, large data centers with about 100,000 stations and nearly a million routing rules are complex and hard to verify. However, these networks are highly regular by design; for example they employ fat tree topologies with backup routers interconnected by redundant patterns. To exploit these regularities, we introduce network transformations: given a reachability formula and a network, we transform the network into a simpler to verify network and a corresponding transformed formula, such that the original formula is valid in the network if and only if the transformed formula is valid in the transformed network. Our network transformations exploit network surgery (in which irrelevant or redundant sets of nodes, headers, ports, or rules are ``sliced'' away) and network symmetry (say between backup routers). The validity of these transformations is established using a formal theory of networks. In particular, using Van Benthem--Hennessy--Milner style bisimulation, we show that one can generally associate bisimulations to transformations connecting networks and formulas with their transforms. Our work is a development in an area of current wide interest: applying programming language techniques (in our case bisimulation and modal logic) to problems in switching networks. We provide experimental evidence that our network transformations can speed up by 65x the task of verifying the communication between all pairs of Virtual Machines in a large datacenter network with about 100,000 VMs. 
An all-pair reachability calculation, which formerly took 5.5 days, can be done in 2 hours, and can be easily parallelized to complete in minutes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Brotherston:2016:MCS, author = "James Brotherston and Nikos Gorogiannis and Max Kanovich and Reuben Rowe", title = "Model checking for symbolic-heap separation logic with inductive predicates", journal = j-SIGPLAN, volume = "51", number = "1", pages = "84--96", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837621", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We investigate the *model checking* problem for symbolic-heap separation logic with user-defined inductive predicates, i.e., the problem of checking that a given stack-heap memory state satisfies a given formula in this language, as arises e.g. in software testing or runtime verification. First, we show that the problem is *decidable*; specifically, we present a bottom-up fixed point algorithm that decides the problem and runs in exponential time in the size of the problem instance. Second, we show that, while model checking for the full language is EXPTIME-complete, the problem becomes NP-complete or PTIME-solvable when we impose natural syntactic restrictions on the schemata defining the inductive predicates. We additionally present NP and PTIME algorithms for these restricted fragments. Finally, we report on the experimental performance of our procedures on a variety of specifications extracted from programs, exercising multiple combinations of syntactic restrictions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Koskinen:2016:RCR, author = "Eric Koskinen and Junfeng Yang", title = "Reducing crash recoverability to reachability", journal = j-SIGPLAN, volume = "51", number = "1", pages = "97--108", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837648", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software applications run on a variety of platforms (filesystems, virtual slices, mobile hardware, etc.) that do not provide 100\% uptime. As such, these applications may crash at any unfortunate moment losing volatile data and, when re-launched, they must be able to correctly recover from potentially inconsistent states left on persistent storage. From a verification perspective, crash recovery bugs can be particularly frustrating because, even when it has been formally proved for a program that it satisfies a property, the proof is foiled by these external events that crash and restart the program. In this paper we first provide a hierarchical formal model of what it means for a program to be crash recoverable. Our model captures the recoverability of many real world programs, including those in our evaluation which use sophisticated recovery algorithms such as shadow paging and write-ahead logging. 
Next, we introduce a novel technique capable of automatically proving that a program correctly recovers from a crash via a reduction to reachability. Our technique takes an input control-flow automaton and transforms it into an encoding that blends the capture of snapshots of pre-crash states into a symbolic search for a proof that recovery terminates and every recovered execution simulates some crash-free execution. Our encoding is designed to enable one to apply existing abstraction techniques in order to do the work that is necessary to prove recoverability. We have implemented our technique in a tool called Eleven82, capable of analyzing C programs to detect recoverability bugs or prove their absence. We have applied our tool to benchmark examples drawn from industrial file systems and databases, including GDBM, LevelDB, LMDB, PostgreSQL, SQLite, VMware and ZooKeeper. Within minutes, our tool is able to discover bugs or prove that these fragments are crash recoverable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Zhang:2016:QGM, author = "Xin Zhang and Ravi Mangal and Aditya V. Nori and Mayur Naik", title = "Query-guided maximum satisfiability", journal = j-SIGPLAN, volume = "51", number = "1", pages = "109--122", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837658", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a new optimization problem ``Q-MaxSAT'', an extension of the well-known Maximum Satisfiability or MaxSAT problem. In contrast to MaxSAT, which aims to find an assignment to all variables in the formula, Q-MaxSAT computes an assignment to a desired subset of variables (or queries) in the formula. Indeed, many problems in diverse domains such as program reasoning, information retrieval, and mathematical optimization can be naturally encoded as Q-MaxSAT instances. We describe an iterative algorithm for solving Q-MaxSAT. In each iteration, the algorithm solves a subproblem that is relevant to the queries, and applies a novel technique to check whether the partial assignment found is a solution to the Q-MaxSAT problem. If the check fails, the algorithm grows the subproblem with a new set of clauses identified as relevant to the queries. Our empirical evaluation shows that our Q-MaxSAT solver Pilot achieves significant improvements in runtime and memory consumption over conventional MaxSAT solvers on several Q-MaxSAT instances generated from real-world problems in program analysis and information retrieval.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Lin:2016:SSW, author = "Anthony W. 
Lin and Pablo Barcel{\'o}", title = "String solving with word equations and transducers: towards a logic for analysing mutation {XSS}", journal = j-SIGPLAN, volume = "51", number = "1", pages = "123--136", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837641", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study the fundamental issue of decidability of satisfiability over string logics with concatenations and finite-state transducers as atomic operations. Although restricting to one type of operations yields decidability, little is known about the decidability of their combined theory, which is especially relevant when analysing security vulnerabilities of dynamic web pages in a more realistic browser model. On the one hand, word equations (string logic with concatenations) cannot precisely capture sanitisation functions (e.g. htmlescape) and implicit browser transductions (e.g. innerHTML mutations). On the other hand, transducers suffer from the reverse problem of being able to model sanitisation functions and browser transductions, but not string concatenations. Naively combining word equations and transducers easily leads to an undecidable logic. Our main contribution is to show that the ``straight-line fragment'' of the logic is decidable (complexity ranges from PSPACE to EXPSPACE). The fragment can express the program logics of straight-line string-manipulating programs with concatenations and transductions as atomic operations, which arise when performing bounded model checking or dynamic symbolic executions. We demonstrate that the logic can naturally express constraints required for analysing mutation XSS in web applications. Finally, the logic remains decidable in the presence of length, letter-counting, regular, indexOf, and disequality constraints.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Cardelli:2016:SCD, author = "Luca Cardelli and Mirco Tribastone and Max Tschaikowski and Andrea Vandin", title = "Symbolic computation of differential equivalences", journal = j-SIGPLAN, volume = "51", number = "1", pages = "137--150", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837649", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Ordinary differential equations (ODEs) are widespread in many natural sciences including chemistry, ecology, and systems biology, and in disciplines such as control theory and electrical engineering. Building on the celebrated molecules-as-processes paradigm, they have become increasingly popular in computer science, with high-level languages and formal methods such as Petri nets, process algebra, and rule-based systems that are interpreted as ODEs. We consider the problem of comparing and minimizing ODEs automatically. Influenced by traditional approaches in the theory of programming, we propose differential equivalence relations. We study them for a basic intermediate language, for which we have decidability results, that can be targeted by a class of high-level specifications. 
An ODE implicitly represents an uncountable state space, hence reasoning techniques cannot be borrowed from established domains such as probabilistic programs with finite-state Markov chain semantics. We provide novel symbolic procedures to check an equivalence and compute the largest one via partition refinement algorithms that use satisfiability modulo theories. We illustrate the generality of our framework by showing that differential equivalences include (i) well-known notions for the minimization of continuous-time Markov chains (lumpability), (ii)~bisimulations for chemical reaction networks recently proposed by Cardelli et al., and (iii) behavioral relations for process algebra with ODE semantics. With a prototype implementation we are able to detect equivalences in biochemical models from the literature that cannot be reduced using competing automatic techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Hague:2016:UDC, author = "Matthew Hague and Jonathan Kochems and C.-H. Luke Ong", title = "Unboundedness and downward closures of higher-order pushdown automata", journal = j-SIGPLAN, volume = "51", number = "1", pages = "151--163", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837627", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We show the diagonal problem for higher-order pushdown automata (HOPDA), and hence the simultaneous unboundedness problem, is decidable. From recent work by Zetzsche this means that we can construct the downward closure of the set of words accepted by a given HOPDA. This also means we can construct the downward closure of the Parikh image of a HOPDA. Both of these consequences play an important role in verifying concurrent higher-order programs expressed as HOPDA or safe higher-order recursion schemes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Devriese:2016:FAC, author = "Dominique Devriese and Marco Patrignani and Frank Piessens", title = "Fully-abstract compilation by approximate back-translation", journal = j-SIGPLAN, volume = "51", number = "1", pages = "164--177", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837618", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A compiler is fully-abstract if the compilation from source language programs to target language programs reflects and preserves behavioural equivalence. Such compilers have important security benefits, as they limit the power of an attacker interacting with the program in the target language to that of an attacker interacting with the program in the source language. Proving compiler full-abstraction is, however, rather complicated. A common proof technique is based on the back-translation of target-level program contexts to behaviourally-equivalent source-level contexts. However, constructing such a back-translation is problematic when the source language is not strong enough to embed an encoding of the target language. 
For instance, when compiling from the simply-typed $ \lambda $-calculus ($ \lambda \tau $) to the untyped $ \lambda $-calculus ($ \lambda u$), the lack of recursive types in \lambda \tau prevents such a back-translation. We propose a general and elegant solution for this problem. The key insight is that it suffices to construct an approximate back-translation. The approximation is only accurate up to a certain number of steps and conservative beyond that, in the sense that the context generated by the back-translation may diverge when the original would not, but not vice versa. Based on this insight, we describe a general technique for proving compiler full-abstraction and demonstrate it on a compiler from $ \lambda \tau $ to $ \lambda u$. The proof uses asymmetric cross-language logical relations and makes innovative use of step-indexing to express the relation between a context and its approximate back-translation. We believe this proof technique can scale to challenging settings and enable simpler, more scalable proofs of compiler full-abstraction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Kang:2016:LVS, author = "Jeehoon Kang and Yoonseung Kim and Chung-Kil Hur and Derek Dreyer and Viktor Vafeiadis", title = "Lightweight verification of separate compilation", journal = j-SIGPLAN, volume = "51", number = "1", pages = "178--190", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837642", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Major compiler verification efforts, such as the CompCert project, have traditionally simplified the verification problem by restricting attention to the correctness of whole-program compilation, leaving open the question of how to verify the correctness of separate compilation. Recently, a number of sophisticated techniques have been proposed for proving more flexible, compositional notions of compiler correctness, but these approaches tend to be quite heavyweight compared to the simple ``closed simulations'' used in verifying whole-program compilation. Applying such techniques to a compiler like CompCert, as Stewart et al. have done, involves major changes and extensions to its original verification. In this paper, we show that if we aim somewhat lower---to prove correctness of separate compilation, but only for a *single* compiler---we can drastically simplify the proof effort. Toward this end, we develop several lightweight techniques that recast the compositional verification problem in terms of whole-program compilation, thereby enabling us to largely reuse the closed-simulation proofs from existing compiler verifications. We demonstrate the effectiveness of these techniques by applying them to CompCert 2.4, converting its verification of whole-program compilation into a verification of separate compilation in less than two person-months. This conversion only required a small number of changes to the original proofs, and uncovered two compiler bugs along the way. 
The result is SepCompCert, the first verification of separate compilation for the full CompCert compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Robbins:2016:MMS, author = "Ed Robbins and Andy King and Tom Schrijvers", title = "From {MinX} to {MinC}: semantics-driven decompilation of recursive datatypes", journal = j-SIGPLAN, volume = "51", number = "1", pages = "191--203", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837633", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reconstructing the meaning of a program from its binary executable is known as reverse engineering; it has a wide range of applications in software security, exposing piracy, legacy systems, etc. Since reversing is ultimately a search for meaning, there is much interest in inferring a type (a meaning) for the elements of a binary in a consistent way. Unfortunately existing approaches do not guarantee any semantic relevance for their reconstructed types. This paper presents a new and semantically-founded approach that provides strong guarantees for the reconstructed types. Key to our approach is the derivation of a witness program in a high-level language alongside the reconstructed types. This witness has the same semantics as the binary, is type correct by construction, and it induces a (justifiable) type assignment on the binary. Moreover, the approach effectively yields a type-directed decompiler. We formalise and implement the approach for reversing MinX, an abstraction of x86, to MinC, a type-safe dialect of C with recursive datatypes. Our evaluation compiles a range of textbook C algorithms to MinX and then recovers the original structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Lorenzen:2016:STD, author = "Florian Lorenzen and Sebastian Erdweg", title = "Sound type-dependent syntactic language extension", journal = j-SIGPLAN, volume = "51", number = "1", pages = "204--216", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837644", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Syntactic language extensions can introduce new facilities into a programming language while requiring little implementation effort and modest changes to the compiler. It is typical to desugar language extensions in a distinguished compiler phase after parsing or type checking, not affecting any of the later compiler phases. If desugaring happens before type checking, the desugaring cannot depend on typing information and type errors are reported in terms of the generated code. If desugaring happens after type checking, the code generated by the desugaring is not type checked and may introduce vulnerabilities. Both options are undesirable. We propose a system for syntactic extensibility where desugaring happens after type checking and desugarings are guaranteed to only generate well-typed code. 
A major novelty of our work is that desugarings operate on typing derivations instead of plain syntax trees. This provides desugarings access to typing information and forms the basis for the soundness guarantee we provide, namely that a desugaring generates a valid typing derivation. We have implemented our system for syntactic extensibility in a language-independent fashion and instantiated it for a substantial subset of Java, including generics and inheritance. We provide a sound Java extension for Scala-like for-comprehensions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Padon:2016:DII, author = "Oded Padon and Neil Immerman and Sharon Shoham and Aleksandr Karbyshev and Mooly Sagiv", title = "Decidability of inferring inductive invariants", journal = j-SIGPLAN, volume = "51", number = "1", pages = "217--231", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837640", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Induction is a successful approach for verification of hardware and software systems. A common practice is to model a system using logical formulas, and then use a decision procedure to verify that some logical formula is an inductive safety invariant for the system. A key ingredient in this approach is coming up with the inductive invariant, which is known as invariant inference. This is a major difficulty, and it is often left for humans or addressed by sound but incomplete abstract interpretation. This paper is motivated by the problem of inductive invariants in shape analysis and in distributed protocols. This paper approaches the general problem of inferring first-order inductive invariants by restricting the language L of candidate invariants. Notice that the problem of invariant inference in a restricted language L differs from the safety problem, since a system may be safe and still not have any inductive invariant in L that proves safety. Clearly, if L is finite (and if testing an inductive invariant is decidable), then inferring invariants in L is decidable. This paper presents some interesting cases when inferring inductive invariants in L is decidable even when L is an infinite language of universal formulas. Decidability is obtained by restricting L and defining a suitable well-quasi-order on the state space. We also present some undecidability results that show that our restrictions are necessary. We further present a framework for systematically constructing infinite languages while keeping the invariant inference problem decidable. 
We illustrate our approach by showing the decidability of inferring invariants for programs manipulating linked-lists, and for distributed protocols.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Lavaee:2016:HDP, author = "Rahman Lavaee", title = "The hardness of data packing", journal = j-SIGPLAN, volume = "51", number = "1", pages = "232--242", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837669", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A program can benefit from improved cache block utilization when contemporaneously accessed data elements are placed in the same memory block. This can reduce the program's memory block working set and thereby, reduce the capacity miss rate. We formally define the problem of data packing for arbitrary number of blocks in the cache and packing factor (the number of data objects fitting in a cache block) and study how well the optimal solution can be approximated for two dual problems. On the one hand, we show that the cache hit maximization problem is approximable within a constant factor, for every fixed number of blocks in the cache. On the other hand, we show that unless P=NP, the cache miss minimization problem cannot be efficiently approximated.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Gimenez:2016:CI, author = "St{\'e}phane Gimenez and Georg Moser", title = "The complexity of interaction", journal = j-SIGPLAN, volume = "51", number = "1", pages = "243--255", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837646", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we analyze the complexity of functional programs written in the interaction-net computation model, an asynchronous, parallel and confluent model that generalizes linear-logic proof nets. Employing user-defined sized and scheduled types, we certify concrete time, space and space-time complexity bounds for both sequential and parallel reductions of interaction-net programs by suitably assigning complexity potentials to typed nodes. The relevance of this approach is illustrated on archetypal programming examples. 
The provided analysis is precise, compositional and is, in theory, not restricted to particular complexity classes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Swamy:2016:DTM, author = "Nikhil Swamy and Catalin Hritcu and Chantal Keller and Aseem Rastogi and Antoine Delignat-Lavaud and Simon Forest and Karthikeyan Bhargavan and C{\'e}dric Fournet and Pierre-Yves Strub and Markulf Kohlweiss and Jean-Karim Zinzindohoue and Santiago Zanella-B{\'e}guelin", title = "Dependent types and multi-monadic effects in {F*}", journal = j-SIGPLAN, volume = "51", number = "1", pages = "256--270", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837655", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new, completely redesigned, version of F*, a language that works both as a proof assistant as well as a general-purpose, verification-oriented, effectful programming language. In support of these complementary roles, F* is a dependently typed, higher-order, call-by-value language with _primitive_ effects including state, exceptions, divergence and IO. Although primitive, programmers choose the granularity at which to specify effects by equipping each effect with a monadic, predicate transformer semantics. F* uses this to efficiently compute weakest preconditions and discharges the resulting proof obligations using a combination of SMT solving and manual proofs. Isolated from the effects, the core of F* is a language of pure functions used to write specifications and proof terms---its consistency is maintained by a semantic termination check based on a well-founded order. We evaluate our design on more than 55,000 lines of F* we have authored in the last year, focusing on three main case studies. Showcasing its use as a general-purpose programming language, F* is programmed (but not verified) in F*, and bootstraps in both OCaml and F\#. Our experience confirms F*'s pay-as-you-go cost model: writing idiomatic ML-like code with no finer specifications imposes no user burden. As a verification-oriented language, our most significant evaluation of F* is in verifying several key modules in an implementation of the TLS-1.2 protocol standard. For the modules we considered, we are able to prove more properties, with fewer annotations using F* than in a prior verified implementation of TLS-1.2. Finally, as a proof assistant, we discuss our use of F* in mechanizing the metatheory of a range of lambda calculi, starting from the simply typed lambda calculus to System F-omega and even micro-F*, a sizeable fragment of F* itself---these proofs make essential use of F*'s flexible combination of SMT automation and constructive proofs, enabling a tactic-free style of programming and proving at a relatively large scale.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Borgstrom:2016:FRF, author = "Johannes Borgstr{\"o}m and Andrew D. 
Gordon and Long Ouyang and Claudio Russo and Adam {\'S}cibior and Marcin Szymczak", title = "{Fabular}: regression formulas as probabilistic programming", journal = j-SIGPLAN, volume = "51", number = "1", pages = "271--283", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837653", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Regression formulas are a domain-specific language adopted by several R packages for describing an important and useful class of statistical models: hierarchical linear regressions. Formulas are succinct, expressive, and clearly popular, so are they a useful addition to probabilistic programming languages? And what do they mean? We propose a core calculus of hierarchical linear regression, in which regression coefficients are themselves defined by nested regressions (unlike in R). We explain how our calculus captures the essence of the formula DSL found in R. We describe the design and implementation of Fabular, a version of the Tabular schema-driven probabilistic programming language, enriched with formulas based on our regression calculus. To the best of our knowledge, this is the first formal description of the core ideas of R's formula notation, the first development of a calculus of regression formulas, and the first demonstration of the benefits of composing regression formulas and latent variables in a probabilistic programming language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Grathwohl:2016:KCN, author = "Bj{\o}rn Bugge Grathwohl and Fritz Henglein and Ulrik Terp Rasmussen and Kristoffer Aalund S{\o}holm and Sebastian Paaske T{\o}rholm", title = "{Kleenex}: compiling nondeterministic transducers to deterministic streaming transducers", journal = j-SIGPLAN, volume = "51", number = "1", pages = "284--297", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837647", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present and illustrate Kleenex, a language for expressing general nondeterministic finite transducers, and its novel compilation to streaming string transducers with essentially optimal streaming behavior, worst-case linear-time performance and sustained high throughput. Its underlying theory is based on transducer decomposition into oracle and action machines: the oracle machine performs streaming greedy disambiguation of the input; the action machine performs the output actions. In use cases Kleenex achieves consistently high throughput rates around the 1 Gbps range on stock hardware. 
It performs well, especially in complex use cases, in comparison to both specialized and related tools such as GNU awk, GNU sed, GNU grep, RE2, Ragel and regular-expression libraries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Long:2016:APG, author = "Fan Long and Martin Rinard", title = "Automatic patch generation by learning correct code", journal = j-SIGPLAN, volume = "51", number = "1", pages = "298--312", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837617", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Prophet, a novel patch generation system that works with a set of successful human patches obtained from open-source software repositories to learn a probabilistic, application-independent model of correct code. It generates a space of candidate patches, uses the model to rank the candidate patches in order of likely correctness, and validates the ranked patches against a suite of test cases to find correct patches. Experimental results show that, on a benchmark set of 69 real-world defects drawn from eight open-source projects, Prophet significantly outperforms the previous state-of-the-art patch generation system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Katz:2016:ETB, author = "Omer Katz and Ran El-Yaniv and Eran Yahav", title = "Estimating types in binaries using predictive modeling", journal = j-SIGPLAN, volume = "51", number = "1", pages = "313--326", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837674", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reverse engineering is an important tool in mitigating vulnerabilities in binaries. As a lot of software is developed in object-oriented languages, reverse engineering of object-oriented code is of critical importance. One of the major hurdles in reverse engineering binaries compiled from object-oriented code is the use of dynamic dispatch. In the absence of debug information, any dynamic dispatch may seem to jump to many possible targets, posing a significant challenge to a reverse engineer trying to track the program flow. We present a novel technique that allows us to statically determine the likely targets of virtual function calls. Our technique uses object tracelets --- statically constructed sequences of operations performed on an object --- to capture potential runtime behaviors of the object. Our analysis automatically pre-labels some of the object tracelets by relying on instances where the type of an object is known. The resulting type-labeled tracelets are then used to train a statistical language model (SLM) for each type. We then use the resulting ensemble of SLMs over unlabeled tracelets to generate a ranking of their most likely types, from which we deduce the likely targets of dynamic dispatches. We have implemented our technique and evaluated it over real-world C++ binaries.
Our evaluation shows that when there are multiple alternative targets, our approach can drastically reduce the number of targets that have to be considered by a reverse engineer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Chatterjee:2016:AAQ, author = "Krishnendu Chatterjee and Hongfei Fu and Petr Novotn{\'y} and Rouzbeh Hasheminezhad", title = "Algorithmic analysis of qualitative and quantitative termination problems for affine probabilistic programs", journal = j-SIGPLAN, volume = "51", number = "1", pages = "327--342", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837639", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we consider termination of probabilistic programs with real-valued variables. The questions concerned are: 1. qualitative ones that ask (i) whether the program terminates with probability 1 (almost-sure termination) and (ii) whether the expected termination time is finite (finite termination); 2. quantitative ones that ask (i) to approximate the expected termination time (expectation problem) and (ii) to compute a bound B such that the probability to terminate after B steps decreases exponentially (concentration problem). To solve these questions, we utilize the notion of ranking supermartingales which is a powerful approach for proving termination of probabilistic programs. In detail, we focus on algorithmic synthesis of linear ranking-supermartingales over affine probabilistic programs (APP's) with both angelic and demonic non-determinism. An important subclass of APP's is LRAPP which is defined as the class of all APP's over which a linear ranking-supermartingale exists. Our main contributions are as follows. Firstly, we show that the membership problem of LRAPP (i) can be decided in polynomial time for APP's with at most demonic non-determinism, and (ii) is NP-hard and in PSPACE for APP's with angelic non-determinism; moreover, the NP-hardness result holds already for APP's without probability and demonic non-determinism. Secondly, we show that the concentration problem over LRAPP can be solved in the same complexity as for the membership problem of LRAPP. Finally, we show that the expectation problem over LRAPP can be solved in 2EXPTIME and is PSPACE-hard even for APP's without probability and non-determinism (i.e., deterministic programs). 
Our experimental results demonstrate the effectiveness of our approach to answer the qualitative and quantitative questions over APP's with at most demonic non-determinism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Singh:2016:TSD, author = "Rishabh Singh and Sumit Gulwani", title = "Transforming spreadsheet data types using examples", journal = j-SIGPLAN, volume = "51", number = "1", pages = "343--356", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837668", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cleaning spreadsheet data types is a common problem faced by millions of spreadsheet users. Data types such as date, time, name, and units are ubiquitous in spreadsheets, and cleaning transformations on these data types involve parsing and pretty printing their string representations. This presents many challenges to users because cleaning such data requires some background knowledge about the data itself and moreover this data is typically non-uniform, unstructured, and ambiguous. Spreadsheet systems and Programming Languages provide some UI-based and programmatic solutions for this problem but they are either insufficient for the user's needs or are beyond their expertise. In this paper, we present a programming by example methodology of cleaning data types that learns the desired transformation from a few input-output examples. We propose a domain specific language with probabilistic semantics that is parameterized with declarative data type definitions. The probabilistic semantics is based on three key aspects: (i) approximate predicate matching, (ii) joint learning of data type interpretation, and (iii) weighted branches. This probabilistic semantics enables the language to handle non-uniform, unstructured, and ambiguous data. We then present a synthesis algorithm that learns the desired program in this language from a set of input-output examples. We have implemented our algorithm as an Excel add-in and present its successful evaluation on 55 benchmark problems obtained from online help forums and Excel product team.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Lesani:2016:CCC, author = "Mohsen Lesani and Christian J. Bell and Adam Chlipala", title = "{Chapar}: certified causally consistent distributed key-value stores", journal = j-SIGPLAN, volume = "51", number = "1", pages = "357--370", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837622", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today's Internet services are often expected to stay available and render high responsiveness even in the face of site crashes and network partitions. Theoretical results state that causal consistency is one of the strongest consistency guarantees that is possible under these requirements, and many practical systems provide causally consistent key-value stores. 
In this paper, we present a framework called Chapar for modular verification of causal consistency for replicated key-value store implementations and their client programs. Specifically, we formulate separate correctness conditions for key-value store implementations and for their clients. The interface between the two is a novel operational semantics for causal consistency. We have verified the causal consistency of two key-value store implementations from the literature using a novel proof technique. We have also implemented a simple automatic model checker for the correctness of client programs. The two independently verified results for the implementations and clients can be composed to conclude the correctness of any of the programs when executed with any of the implementations. We have developed and checked our framework in Coq, extracted it to OCaml, and built executable stores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Gotsman:2016:CIS, author = "Alexey Gotsman and Hongseok Yang and Carla Ferreira and Mahsa Najafzadeh and Marc Shapiro", title = "'{Cause} {I}'m strong enough': Reasoning about consistency choices in distributed systems", journal = j-SIGPLAN, volume = "51", number = "1", pages = "371--384", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837625", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large-scale distributed systems often rely on replicated databases that allow a programmer to request different data consistency guarantees for different operations, and thereby control their performance. Using such databases is far from trivial: requesting stronger consistency in too many places may hurt performance, and requesting it in too few places may violate correctness. To help programmers in this task, we propose the first proof rule for establishing that a particular choice of consistency guarantees for various operations on a replicated database is enough to ensure the preservation of a given data integrity invariant. Our rule is modular: it allows reasoning about the behaviour of every operation separately under some assumption on the behaviour of other operations. This leads to simple reasoning, which we have automated in an SMT-based tool. We present a nontrivial proof of soundness of our rule and illustrate its use on several examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Liang:2016:PLC, author = "Hongjin Liang and Xinyu Feng", title = "A program logic for concurrent objects under fair scheduling", journal = j-SIGPLAN, volume = "51", number = "1", pages = "385--399", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837635", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Existing work on verifying concurrent objects is mostly concerned with safety only, e.g., partial correctness or linearizability. 
Although there has been recent work verifying lock-freedom of non-blocking objects, much less efforts are focused on deadlock-freedom and starvation-freedom, progress properties of blocking objects. These properties are more challenging to verify than lock-freedom because they allow the progress of one thread to depend on the progress of another, assuming fair scheduling. We propose LiLi, a new rely-guarantee style program logic for verifying linearizability and progress together for concurrent objects under fair scheduling. The rely-guarantee style logic unifies thread-modular reasoning about both starvation-freedom and deadlock-freedom in one framework. It also establishes progress-aware abstraction for concurrent objects, which can be applied when verifying safety and liveness of client code. We have successfully applied the logic to verify starvation-freedom or deadlock-freedom of representative algorithms such as ticket locks, queue locks, lock-coupling lists, optimistic lists and lazy lists.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Dragoi:2016:PPS, author = "Cezara Dragoi and Thomas A. Henzinger and Damien Zufferey", title = "{PSync}: a partially synchronous language for fault-tolerant distributed algorithms", journal = j-SIGPLAN, volume = "51", number = "1", pages = "400--415", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837650", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Fault-tolerant distributed algorithms play an important role in many critical/high-availability applications. These algorithms are notoriously difficult to implement correctly, due to asynchronous communication and the occurrence of faults, such as the network dropping messages or computers crashing. We introduce PSync, a domain specific language based on the Heard-Of model, which views asynchronous faulty systems as synchronous ones with an adversarial environment that simulates asynchrony and faults by dropping messages. We define a runtime system for PSync that efficiently executes on asynchronous networks. We formalise the relation between the runtime system and PSync in terms of observational refinement. The high-level lockstep abstraction introduced by PSync simplifies the design and implementation of fault-tolerant distributed algorithms and enables automated formal verification. We have implemented an embedding of PSync in the Scala programming language with a runtime system for partially synchronous networks. 
We show the applicability of PSync by implementing several important fault-tolerant distributed algorithms and we compare the implementation of consensus algorithms in PSync against implementations in other languages in terms of code size, runtime efficiency, and verification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Chen:2016:PTI, author = "Sheng Chen and Martin Erwig", title = "Principal type inference for {GADTs}", journal = j-SIGPLAN, volume = "51", number = "1", pages = "416--428", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837665", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new method for GADT type inference that improves the precision of previous approaches. In particular, our approach accepts more type-correct programs than previous approaches when they do not employ type annotations. A side benefit of our approach is that it can detect a wide range of runtime errors that are missed by previous approaches. Our method is based on the idea to represent type refinements in pattern-matching branches by choice types, which facilitate a separation of the typing and reconciliation phases and thus support case expressions. This idea is formalized in a type system, which is both sound and a conservative extension of the classical Hindley--Milner system. We present the results of an empirical evaluation that compares our algorithm with previous approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Garcia:2016:AGT, author = "Ronald Garcia and Alison M. Clark and {\'E}ric Tanter", title = "Abstracting gradual typing", journal = j-SIGPLAN, volume = "51", number = "1", pages = "429--442", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837670", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Language researchers and designers have extended a wide variety of type systems to support gradual typing, which enables languages to seamlessly combine dynamic and static checking. These efforts consistently demonstrate that designing a satisfactory gradual counterpart to a static type system is challenging, and this challenge only increases with the sophistication of the type system. Gradual type system designers need more formal tools to help them conceptualize, structure, and evaluate their designs. In this paper, we propose a new formal foundation for gradual typing, drawing on principles from abstract interpretation to give gradual types a semantics in terms of pre-existing static types. Abstracting Gradual Typing (AGT for short) yields a formal account of consistency---one of the cornerstones of the gradual typing approach---that subsumes existing notions of consistency, which were developed through intuition and ad hoc reasoning. Given a syntax-directed static typing judgment, the AGT approach induces a corresponding gradual typing judgment. 
Then the type safety proof for the underlying static discipline induces a dynamic semantics for gradual programs defined over source-language typing derivations. The AGT approach does not resort to an externally justified cast calculus: instead, run-time checks naturally arise by deducing evidence for consistent judgments during proof reduction. To illustrate the approach, we develop a novel gradually-typed counterpart for a language with record subtyping. Gradual languages designed with the AGT approach satisfy by construction the refined criteria for gradual typing set forth by Siek and colleagues.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Cimini:2016:GMA, author = "Matteo Cimini and Jeremy G. Siek", title = "The gradualizer: a methodology and algorithm for generating gradual type systems", journal = j-SIGPLAN, volume = "51", number = "1", pages = "443--455", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837632", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many languages are beginning to integrate dynamic and static typing. Siek and Taha offered gradual typing as an approach to this integration that provides a coherent and full-span migration between the two disciplines. However, the literature lacks a general methodology for designing gradually typed languages. Our first contribution is to provide a methodology for deriving the gradual type system and the compilation to the cast calculus. Based on this methodology, we present the Gradualizer, an algorithm that generates a gradual type system from a well-formed type system and also generates a compiler to the cast calculus. Our algorithm handles a large class of type systems and generates systems that are correct with respect to the formal criteria of gradual typing. We also report on an implementation of the Gradualizer that takes a type system expressed in lambda-prolog and outputs its gradually typed version and a compiler to the cast calculus in lambda-prolog.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Takikawa:2016:SGT, author = "Asumu Takikawa and Daniel Feltey and Ben Greenman and Max S. New and Jan Vitek and Matthias Felleisen", title = "Is sound gradual typing dead?", journal = j-SIGPLAN, volume = "51", number = "1", pages = "456--468", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837630", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmers have come to embrace dynamically-typed languages for prototyping and delivering large and complex systems. When it comes to maintaining and evolving these systems, the lack of explicit static typing becomes a bottleneck. In response, researchers have explored the idea of gradually-typed programming languages which allow the incremental addition of type annotations to software written in one of these untyped languages. 
Some of these new, hybrid languages insert run-time checks at the boundary between typed and untyped code to establish type soundness for the overall system. With sound gradual typing, programmers can rely on the language implementation to provide meaningful error messages when type invariants are violated. While most research on sound gradual typing remains theoretical, the few emerging implementations suffer from performance overheads due to these checks. None of the publications on this topic comes with a comprehensive performance evaluation. Worse, a few report disastrous numbers. In response, this paper proposes a method for evaluating the performance of gradually-typed programming languages. The method hinges on exploring the space of partial conversions from untyped to typed. For each benchmark, the performance of the different versions is reported in a synthetic metric that associates runtime overhead to conversion effort. The paper reports on the results of applying the method to Typed Racket, a mature implementation of sound gradual typing, using a suite of real-world programs of various sizes and complexities. Based on these results the paper concludes that, given the current state of implementation technologies, sound gradual typing faces significant challenges. Conversely, it raises the question of how implementations could reduce the overheads associated with soundness and how tools could be used to steer programmers clear from pathological cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Octeau:2016:CSA, author = "Damien Octeau and Somesh Jha and Matthew Dering and Patrick McDaniel and Alexandre Bartel and Li Li and Jacques Klein and Yves {Le Traon}", title = "Combining static analysis with probabilistic models to enable market-scale {Android} inter-component analysis", journal = j-SIGPLAN, volume = "51", number = "1", pages = "469--484", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837661", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Static analysis has been successfully used in many areas, from verifying mission-critical software to malware detection. Unfortunately, static analysis often produces false positives, which require significant manual effort to resolve. In this paper, we show how to overlay a probabilistic model, trained using domain knowledge, on top of static analysis results, in order to triage static analysis results. We apply this idea to analyzing mobile applications. Android application components can communicate with each other, both within single applications and between different applications. Unfortunately, techniques to statically infer Inter-Component Communication (ICC) yield many potential inter-component and inter-application links, most of which are false positives. At large scales, scrutinizing all potential links is simply not feasible. We therefore overlay a probabilistic model of ICC on top of static analysis results. Since computing the inter-component links is a prerequisite to inter-component analysis, we introduce a formalism for inferring ICC links based on set constraints. We design an efficient algorithm for performing link resolution. 
We compute all potential links in a corpus of 11,267 applications in 30 minutes and triage them using our probabilistic approach. We find that over 95.1\% of all 636 million potential links are associated with probability values below 0.01 and are thus likely unfeasible links. Thus, it is possible to consider only a small subset of all links without significant loss of information. This work is the first significant step in making static inter-application analysis more tractable, even at large scales.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Grigore:2016:ARG, author = "Radu Grigore and Hongseok Yang", title = "Abstraction refinement guided by a learnt probabilistic model", journal = j-SIGPLAN, volume = "51", number = "1", pages = "485--498", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837663", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The core challenge in designing an effective static program analysis is to find a good program abstraction --- one that retains only details relevant to a given query. In this paper, we present a new approach for automatically finding such an abstraction. Our approach uses a pessimistic strategy, which can optionally use guidance from a probabilistic model. Our approach applies to parametric static analyses implemented in Datalog, and is based on counterexample-guided abstraction refinement. For each untried abstraction, our probabilistic model provides a probability of success, while the size of the abstraction provides an estimate of its cost in terms of analysis time. Combining these two metrics, probability and cost, our refinement algorithm picks an optimal abstraction. Our probabilistic model is a variant of the Erdos--Renyi random graph model, and it is tunable by what we call hyperparameters. We present a method to learn good values for these hyperparameters, by observing past runs of the analysis on an existing codebase. We evaluate our approach on an object sensitive pointer analysis for Java programs, with two client analyses (PolySite and Downcast).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Garg:2016:LIU, author = "Pranav Garg and Daniel Neider and P. Madhusudan and Dan Roth", title = "Learning invariants using decision trees and implication counterexamples", journal = j-SIGPLAN, volume = "51", number = "1", pages = "499--512", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837664", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Inductive invariants can be robustly synthesized using a learning model where the teacher is a program verifier who instructs the learner through concrete program configurations, classified as positive, negative, and implications. We propose the first learning algorithms in this model with implication counter-examples that are based on machine learning techniques. 
In particular, we extend classical decision-tree learning algorithms in machine learning to handle implication samples, building new scalable ways to construct small decision trees using statistical measures. We also develop a decision-tree learning algorithm in this model that is guaranteed to converge to the right concept (invariant) if one exists. We implement the learners and an appropriate teacher, and show that the resulting invariant synthesis is efficient and convergent for a large suite of programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Emmi:2016:SAD, author = "Michael Emmi and Constantin Enea", title = "Symbolic abstract data type inference", journal = j-SIGPLAN, volume = "51", number = "1", pages = "513--525", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837645", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Formal specification is a vital ingredient to scalable verification of software systems. In the case of efficient implementations of concurrent objects like atomic registers, queues, and locks, symbolic formal representations of their abstract data types (ADTs) enable efficient modular reasoning, decoupling clients from implementations. Writing adequate formal specifications, however, is a complex task requiring rare expertise. In practice, programmers write reference implementations as informal specifications. In this work we demonstrate that effective symbolic ADT representations can be automatically generated from the executions of reference implementations. Our approach exploits two key features of naturally-occurring ADTs: violations can be decomposed into a small set of representative patterns, and these patterns manifest in executions with few operations. By identifying certain algebraic properties of naturally-occurring ADTs, and exhaustively sampling executions up to a small number of operations, we generate concise symbolic ADT representations which are complete in practice, enabling the application of efficient symbolic verification algorithms without the burden of manual specification. Furthermore, the concise ADT violation patterns we generate are human-readable, and can serve as useful, formal documentation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Bhaskaracharya:2016:SIA, author = "Somashekaracharya G. Bhaskaracharya and Uday Bondhugula and Albert Cohen", title = "{SMO}: an integrated approach to intra-array and inter-array storage optimization", journal = j-SIGPLAN, volume = "51", number = "1", pages = "526--538", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837636", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The polyhedral model provides an expressive intermediate representation that is convenient for the analysis and subsequent transformation of affine loop nests. Several heuristics exist for achieving complex program transformations in this model. 
However, there is also considerable scope to utilize this model to tackle the problem of automatic memory footprint optimization. In this paper, we present a new automatic storage optimization technique which can be used to achieve both intra-array as well as inter-array storage reuse with a pre-determined schedule for the computation. Our approach works by finding statement-wise storage partitioning hyperplanes that partition a unified global array space so that values with overlapping live ranges are not mapped to the same partition. Our heuristic is driven by a fourfold objective function which not only minimizes the dimensionality and storage requirements of arrays required for each high-level statement, but also maximizes inter-statement storage reuse. The storage mappings obtained using our heuristic can be asymptotically better than those obtained by any existing technique. We implement our technique and demonstrate its practical impact by evaluating its effectiveness on several benchmarks chosen from the domains of image processing, stencil computations, and high-performance computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Bao:2016:PDV, author = "Wenlei Bao and Sriram Krishnamoorthy and Louis-No{\"e}l Pouchet and Fabrice Rastello and P. Sadayappan", title = "{PolyCheck}: dynamic verification of iteration space transformations on affine programs", journal = j-SIGPLAN, volume = "51", number = "1", pages = "539--554", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837656", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "High-level compiler transformations, especially loop transformations, are widely recognized as critical optimizations to restructure programs to improve data locality and expose parallelism. Guaranteeing the correctness of program transformations is essential, and to date three main approaches have been developed: proof of equivalence of affine programs, matching the execution traces of programs, and checking bit-by-bit equivalence of program outputs. Each technique suffers from limitations in the kind of transformations supported, space complexity, or the sensitivity to the testing dataset. In this paper, we take a novel approach that addresses all three limitations to provide an automatic bug checker to verify any iteration reordering transformations on affine programs, including non-affine transformations, with space consumption proportional to the original program data and robust to arbitrary datasets of a given size. We achieve this by exploiting the structure of affine program control- and data-flow to generate at compile-time lightweight checker code to be executed within the transformed program. 
Experimental results assess the correctness and effectiveness of our method and its increased coverage over previous approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Andrysco:2016:PFP, author = "Marc Andrysco and Ranjit Jhala and Sorin Lerner", title = "Printing floating-point numbers: a faster, always correct method", journal = j-SIGPLAN, volume = "51", number = "1", pages = "555--567", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837654", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Floating-point numbers are an essential part of modern software, recently gaining particular prominence on the web as the exclusive numeric format of Javascript. To use floating-point numbers, we require a way to convert binary machine representations into human readable decimal outputs. Existing conversion algorithms make trade-offs between completeness and performance. The classic Dragon4 algorithm by Steele and White and its later refinements achieve completeness --- i.e. produce correct and optimal outputs on all inputs --- by using arbitrary precision integer (bignum) arithmetic which leads to a high performance cost. On the other hand, the recent Grisu3 algorithm by Loitsch shows how to recover performance by using native integer arithmetic but sacrifices optimality for 0.5\% of all inputs. We present Errol, a new complete algorithm that is guaranteed to produce correct and optimal results for all inputs while simultaneously being 2x faster than the incomplete Grisu3 and 4x faster than previous complete methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Orchard:2016:ESS, author = "Dominic Orchard and Nobuko Yoshida", title = "Effects as sessions, sessions as effects", journal = j-SIGPLAN, volume = "51", number = "1", pages = "568--581", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837634", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Effect and session type systems are two expressive behavioural type systems. The former is usually developed in the context of the lambda-calculus and its variants, the latter for the pi-calculus. In this paper we explore their relative expressive power. Firstly, we give an embedding from PCF, augmented with a parameterised effect system, into a session-typed pi-calculus (session calculus), showing that session types are powerful enough to express effects. Secondly, we give a reverse embedding, from the session calculus back into PCF, by instantiating PCF with concurrency primitives and its effect system with a session-like effect algebra; effect systems are powerful enough to express sessions. The embedding of session types into an effect system is leveraged to give a new implementation of session types in Haskell, via an effect system encoding. The correctness of this implementation follows from the second embedding result. 
We also discuss various extensions to our embeddings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Jia:2016:MBA, author = "Limin Jia and Hannah Gommerstadt and Frank Pfenning", title = "Monitors and blame assignment for higher-order session types", journal = j-SIGPLAN, volume = "51", number = "1", pages = "582--594", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837662", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Session types provide a means to prescribe the communication behavior between concurrent message-passing processes. However, in a distributed setting, some processes may be written in languages that do not support static typing of sessions or may be compromised by a malicious intruder, violating invariants of the session types. In such a setting, dynamically monitoring communication between processes becomes a necessity for identifying undesirable actions. In this paper, we show how to dynamically monitor communication to enforce adherence to session types in a higher-order setting. We present a system of blame assignment in the case when the monitor detects an undesirable action and an alarm is raised. We prove that dynamic monitoring does not change system behavior for well-typed processes, and that one of an indicated set of possible culprits must have been compromised in case of an alarm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Sangiorgi:2016:EBP, author = "Davide Sangiorgi and Valeria Vignudelli", title = "Environmental bisimulations for probabilistic higher-order languages", journal = j-SIGPLAN, volume = "51", number = "1", pages = "595--607", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837651", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Environmental bisimulations for probabilistic higher-order languages are studied. In contrast with applicative bisimulations, environmental bisimulations are known to be more robust and do not require sophisticated techniques such as Howe's in the proofs of congruence. As representative calculi, call-by-name and call-by-value \lambda-calculus, and a (call-by-value) \lambda-calculus extended with references (i.e., a store) are considered. In each case full abstraction results are derived for probabilistic environmental similarity and bisimilarity with respect to contextual preorder and contextual equivalence, respectively. Some possible enhancements of the (bi)simulations, as `up-to techniques', are also presented. Probabilities force a number of modifications to the definition of environmental bisimulations in non-probabilistic languages. Some of these modifications are specific to probabilities, others may be seen as general refinements of environmental bisimulations, applicable also to non-probabilistic languages.
Several examples are presented, to illustrate the modifications and the differences.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Flur:2016:MAA, author = "Shaked Flur and Kathryn E. Gray and Christopher Pulte and Susmit Sarkar and Ali Sezgin and Luc Maranget and Will Deacon and Peter Sewell", title = "Modelling the {ARMv8} architecture, operationally: concurrency and {ISA}", journal = j-SIGPLAN, volume = "51", number = "1", pages = "608--621", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837615", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper we develop semantics for key aspects of the ARMv8 multiprocessor architecture: the concurrency model and much of the 64-bit application-level instruction set (ISA). Our goal is to clarify what the range of architecturally allowable behaviour is, and thereby to support future work on formal verification, analysis, and testing of concurrent ARM software and hardware. Establishing such models with high confidence is intrinsically difficult: it involves capturing the vendor's architectural intent, aspects of which (especially for concurrency) have not previously been precisely defined. We therefore first develop a concurrency model with a microarchitectural flavour, abstracting from many hardware implementation concerns but still close to hardware-designer intuition. This means it can be discussed in detail with ARM architects. We then develop a more abstract model, better suited for use as an architectural specification, which we prove sound w.r.t.~the first. The instruction semantics involves further difficulties, handling the mass of detail and the subtle intensional information required to interface to the concurrency model. We have a novel ISA description language, with a lightweight dependent type system, letting us do both with a rather direct representation of the ARM reference manual instruction descriptions. We build a tool from the combined semantics that lets one explore, either interactively or exhaustively, the full range of architecturally allowed behaviour, for litmus tests and (small) ELF executables. We prove correctness of some optimisations needed for tool performance. 
We validate the models by discussion with ARM staff, and by comparison against ARM hardware behaviour, for ISA single-instruction tests and concurrent litmus tests.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Pichon-Pharabod:2016:CSR, author = "Jean Pichon-Pharabod and Peter Sewell", title = "A concurrency semantics for relaxed atomics that permits optimisation and avoids thin-air executions", journal = j-SIGPLAN, volume = "51", number = "1", pages = "622--633", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837616", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite much research on concurrent programming languages, especially for Java and C/C++, we still do not have a satisfactory definition of their semantics, one that admits all common optimisations without also admitting undesired behaviour. Especially problematic are the ``thin-air'' examples involving high-performance concurrent accesses, such as C/C++11 relaxed atomics. The C/C++11 model is in a per-candidate-execution style, and previous work has identified a tension between that and the fact that compiler optimisations do not operate over single candidate executions in isolation; rather, they operate over syntactic representations that represent all executions. In this paper we propose a novel approach that circumvents this difficulty. We define a concurrency semantics for a core calculus, including relaxed-atomic and non-atomic accesses, and locks, that admits a wide range of optimisation while still forbidding the classic thin-air examples. It also addresses other problems relating to undefined behaviour. The basic idea is to use an event-structure representation of the current state of each thread, capturing all of its potential executions, and to permit interleaving of execution and transformation steps over that to reflect optimisation (possibly dynamic) of the code. These are combined with a non-multi-copy-atomic storage subsystem, to reflect common hardware behaviour. The semantics is defined in a mechanised and executable form, and designed to be implementable above current relaxed hardware and strong enough to support the programming idioms that C/C++11 does for this fragment. It offers a potential way forward for concurrent programming language semantics, beyond the current C/C++11 and Java models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Batty:2016:OSA, author = "Mark Batty and Alastair F.
Donaldson and John Wickerson", title = "Overhauling {SC} atomics in {C11} and {OpenCL}", journal = j-SIGPLAN, volume = "51", number = "1", pages = "634--648", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837637", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite the conceptual simplicity of sequential consistency (SC), the semantics of SC atomic operations and fences in the C11 and OpenCL memory models is subtle, leading to convoluted prose descriptions that translate to complex axiomatic formalisations. We conduct an overhaul of SC atomics in C11, reducing the associated axioms in both number and complexity. A consequence of our simplification is that the SC operations in an execution no longer need to be totally ordered. This relaxation enables, for the first time, efficient and exhaustive simulation of litmus tests that use SC atomics. We extend our improved C11 model to obtain the first rigorous memory model formalisation for OpenCL (which extends C11 with support for heterogeneous many-core programming). In the OpenCL setting, we refine the SC axioms still further to give a sensible semantics to SC operations that employ a `memory scope' to restrict their visibility to specific threads. Our overhaul requires slight strengthenings of both the C11 and the OpenCL memory models, causing some behaviours to become disallowed. We argue that these strengthenings are natural, and that all of the formalised C11 and OpenCL compilation schemes of which we are aware (Power and x86 CPUs for C11, AMD GPUs for OpenCL) remain valid in our revised models. Using the HERD memory model simulator, we show that our overhaul leads to an exponential improvement in simulation time for C11 litmus tests compared with the original model, making *exhaustive* simulation competitive, time-wise, with the *non-exhaustive* CDSChecker tool.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Lahav:2016:TRA, author = "Ori Lahav and Nick Giannarakis and Viktor Vafeiadis", title = "Taming release-acquire consistency", journal = j-SIGPLAN, volume = "51", number = "1", pages = "649--662", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837643", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a strengthening of the release-acquire fragment of the C11 memory model that (i) forbids dubious behaviors that are not observed in any implementation; (ii) supports fence instructions that restore sequential consistency; and (iii) admits an equivalent intuitive operational semantics based on point-to-point communication. This strengthening has no additional implementation cost: it allows the same local optimizations as C11 release and acquire accesses, and has exactly the same compilation schemes to the x86-TSO and Power architectures. In fact, the compilation to Power is complete with respect to a recent axiomatic model of Power; that is, the compiled program exhibits exactly the same behaviors as the source one. 
Moreover, we provide criteria for placing enough fence instructions to ensure sequential consistency, and apply them to an efficient RCU implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Reps:2016:NPA, author = "Thomas Reps and Emma Turetsky and Prathmesh Prabhu", title = "{Newtonian} program analysis via tensor product", journal = j-SIGPLAN, volume = "51", number = "1", pages = "663--677", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837659", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recently, Esparza et al. generalized Newton's method --- a numerical-analysis algorithm for finding roots of real-valued functions---to a method for finding fixed-points of systems of equations over semirings. Their method provides a new way to solve interprocedural dataflow-analysis problems. As in its real-valued counterpart, each iteration of their method solves a simpler ``linearized'' problem. One of the reasons this advance is exciting is that some numerical analysts have claimed that ```all' effective and fast iterative [numerical] methods are forms (perhaps very disguised) of Newton's method.'' However, there is an important difference between the dataflow-analysis and numerical-analysis contexts: when Newton's method is used on numerical-analysis problems, multiplicative commutativity is relied on to rearrange expressions of the form ``c*X + X*d'' into ``(c+d) * X.'' Such equations correspond to path problems described by regular languages. In contrast, when Newton's method is used for interprocedural dataflow analysis, the ``multiplication'' operation involves function composition, and hence is non-commutative: ``c*X + X*d'' cannot be rearranged into ``(c+d) * X.'' Such equations correspond to path problems described by linear context-free languages (LCFLs). In this paper, we present an improved technique for solving the LCFL sub-problems produced during successive rounds of Newton's method. Our method applies to predicate abstraction, on which most of today's software model checkers rely.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Wu:2016:CEA, author = "Rongxin Wu and Xiao Xiao and Shing-Chi Cheung and Hongyu Zhang and Charles Zhang", title = "{Casper}: an efficient approach to call trace collection", journal = j-SIGPLAN, volume = "51", number = "1", pages = "678--690", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837619", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Call traces, i.e., sequences of function calls and returns, are fundamental to a wide range of program analyses such as bug reproduction, fault diagnosis, performance analysis, and many others. The conventional approach to collect call traces that instruments each function call and return site incurs large space and time overhead. 
Our approach aims at reducing the recording overheads by instrumenting only a small number of call sites while keeping the capability of recovering the full trace. We propose a call trace model and a logged call trace model based on an LL(1) grammar, which enables us to define the criteria of a feasible solution to call trace collection. Based on the two models, we prove that collecting call traces with minimal instrumentation is an NP-hard problem. We then propose an efficient approach to obtaining a suboptimal solution. We implemented our approach as a tool, Casper, and evaluated it using the DaCapo benchmark suite. The experimental results show that our approach causes significantly lower runtime (and space) overhead than two state-of-the-art approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Gilray:2016:PCF, author = "Thomas Gilray and Steven Lyde and Michael D. Adams and Matthew Might and David {Van Horn}", title = "Pushdown control-flow analysis for free", journal = j-SIGPLAN, volume = "51", number = "1", pages = "691--704", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837631", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Traditional control-flow analysis (CFA) for higher-order languages introduces spurious connections between callers and callees, and different invocations of a function may pollute each other's return flows. Recently, three distinct approaches have been published that provide perfect call-stack precision in a computable manner: CFA2, PDCFA, and AAC. Unfortunately, implementing CFA2 and PDCFA requires significant engineering effort. Furthermore, all three are computationally expensive. For a monovariant analysis, CFA2 is in O(2^n), PDCFA is in O(n^6), and AAC is in O(n^8). In this paper, we describe a new technique that builds on these but is both straightforward to implement and computationally inexpensive. The crucial insight is an unusual state-dependent allocation strategy for the addresses of continuations. Our technique imposes only a constant-factor overhead on the underlying analysis and costs only O(n^3) in the monovariant case. We present the intuitions behind this development, benchmarks demonstrating its efficacy, and a proof of the precision of this analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Flatt:2016:BSS, author = "Matthew Flatt", title = "Binding as sets of scopes", journal = j-SIGPLAN, volume = "51", number = "1", pages = "705--717", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837620", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Our new macro expander for Racket builds on a novel approach to hygiene. Instead of basing macro expansion on variable renamings that are mediated by expansion history, our new expander tracks binding through a set of scopes that an identifier acquires from both binding forms and macro expansions.
The resulting model of macro expansion is simpler and more uniform than one based on renaming, and it is sufficiently compatible with Racket's old expander to be practical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Hasuo:2016:LTP, author = "Ichiro Hasuo and Shunsuke Shimizu and Corina C{\^\i}rstea", title = "Lattice-theoretic progress measures and coalgebraic model checking", journal = j-SIGPLAN, volume = "51", number = "1", pages = "718--732", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837673", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the context of formal verification in general and model checking in particular, parity games serve as a mighty vehicle: many problems are encoded as parity games, which are then solved by the seminal algorithm by Jurdzinski. In this paper we identify the essence of this workflow to be the notion of progress measure, and formalize it in general, possibly infinitary, lattice-theoretic terms. Our view on progress measures is that they are to nested/alternating fixed points what invariants are to safety/greatest fixed points, and what ranking functions are to liveness/least fixed points. That is, progress measures are a combination of the latter two notions (invariant and ranking function) that have been extensively studied in the context of (program) verification. We then apply our theory of progress measures to a general model-checking framework, where systems are categorically presented as coalgebras. The framework's theoretical robustness is witnessed by a smooth transfer from the branching-time setting to the linear-time one. Although the framework can be used to derive some decision procedures for finite settings, we also expect the proposed framework to form a basis for sound proof methods for some undecidable/infinitary problems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Chatterjee:2016:AAP, author = "Krishnendu Chatterjee and Amir Kafshdar Goharshady and Rasmus Ibsen-Jensen and Andreas Pavlogiannis", title = "Algorithms for algebraic path properties in concurrent systems of constant treewidth components", journal = j-SIGPLAN, volume = "51", number = "1", pages = "733--747", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837624", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study algorithmic questions for concurrent systems where the transitions are labeled from a complete, closed semiring, and path properties are algebraic with semiring operations. The algebraic path properties can model dataflow analysis problems, the shortest path problem, and many other natural problems that arise in program analysis. We consider that each component of the concurrent system is a graph with constant treewidth, a property satisfied by the control-flow graphs of most programs. We allow for multiple possible queries, which arise naturally in demand-driven dataflow analysis.
The study of multiple queries allows us to consider the tradeoff between the resource usage of the one-time preprocessing and that of each individual query. The traditional approach constructs the product graph of all components and applies the best-known graph algorithm on the product. In this approach, even the answer to a single query requires the transitive closure (i.e., the results of all possible queries), which provides no room for tradeoff between preprocessing and query time. Our main contributions are algorithms that significantly improve the worst-case running time of the traditional approach, and provide various tradeoffs depending on the number of queries. For example, in a concurrent system of two components, the traditional approach requires hexic time in the worst case for answering one query as well as computing the transitive closure, whereas we show that with one-time preprocessing in almost cubic time, each subsequent query can be answered in at most linear time, and even the transitive closure can be computed in almost quartic time. Furthermore, we establish conditional optimality results showing that the worst-case running time of our algorithms cannot be improved without achieving major breakthroughs in graph algorithms (i.e., improving the worst-case bound for the shortest path problem in general graphs). Preliminary experimental results show that our algorithms perform favorably on several benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Muroya:2016:MGI, author = "Koko Muroya and Naohiko Hoshino and Ichiro Hasuo", title = "Memoryful geometry of interaction {II}: recursion and adequacy", journal = j-SIGPLAN, volume = "51", number = "1", pages = "748--760", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837672", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A general framework of Memoryful Geometry of Interaction (mGoI) was recently introduced by the authors. It provides a sound translation of lambda-terms (on the high-level) to their realizations by stream transducers (on the low-level), where the internal states of the latter (called memories) are exploited for accommodating algebraic effects of Plotkin and Power. The translation is compositional, hence ``denotational,'' where transducers are inductively composed using an adaptation of Barbosa's coalgebraic component calculus. In the current paper we extend the mGoI framework and provide a systematic treatment of recursion---an essential feature of programming languages that was however missing in our previous work. Specifically, we introduce two new fixed-point operators in the coalgebraic component calculus. The two follow the previous work on recursion in GoI and are called Girard style and Mackie style: the former obviously exhibits some nice domain-theoretic properties, while the latter allows simpler construction. Their equivalence is established on the categorical (or, traced monoidal) level of abstraction, and is therefore generic with respect to the choice of algebraic effects.
Our main result is an adequacy theorem of our mGoI translation, against Plotkin and Power's operational semantics for algebraic effects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Raychev:2016:LPN, author = "Veselin Raychev and Pavol Bielik and Martin Vechev and Andreas Krause", title = "Learning programs from noisy data", journal = j-SIGPLAN, volume = "51", number = "1", pages = "761--774", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837671", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new approach for learning programs from noisy datasets. Our approach is based on two new concepts: a regularized program generator which produces a candidate program based on a small sample of the entire dataset while avoiding overfitting, and a dataset sampler which carefully samples the dataset by leveraging the candidate program's score on that dataset. The two components are connected in a continuous feedback-directed loop. We show how to apply this approach to two settings: one where the dataset has a bound on the noise, and another without a noise bound. The second setting leads to a new way of performing approximate empirical risk minimization on hypotheses classes formed by a discrete search space. We then present two new kinds of program synthesizers which target the two noise settings. First, we introduce a novel regularized bitstream synthesizer that successfully generates programs even in the presence of incorrect examples. We show that the synthesizer can detect errors in the examples while combating overfitting --- a major problem in existing synthesis techniques. We also show how the approach can be used in a setting where the dataset grows dynamically via new examples (e.g., provided by a human). Second, we present a novel technique for constructing statistical code completion systems. These are systems trained on massive datasets of open source programs, also known as ``Big Code''. The key idea is to introduce a domain specific language (DSL) over trees and to learn functions in that DSL directly from the dataset. These learned functions then condition the predictions made by the system. This is a flexible and powerful technique which generalizes several existing works as we no longer need to decide a priori on what the prediction should be conditioned (another benefit is that the learned functions are a natural mechanism for explaining the prediction). 
As a result, our code completion system surpasses the prediction capabilities of existing, hard-wired systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Bornholt:2016:OSM, author = "James Bornholt and Emina Torlak and Dan Grossman and Luis Ceze", title = "Optimizing synthesis with metasketches", journal = j-SIGPLAN, volume = "51", number = "1", pages = "775--788", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837666", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many advanced programming tools---for both end-users and expert developers---rely on program synthesis to automatically generate implementations from high-level specifications. These tools often need to employ tricky, custom-built synthesis algorithms because they require synthesized programs to be not only correct, but also optimal with respect to a desired cost metric, such as program size. Finding these optimal solutions efficiently requires domain-specific search strategies, but existing synthesizers hard-code the strategy, making them difficult to reuse. This paper presents metasketches, a general framework for specifying and solving optimal synthesis problems. metasketches make the search strategy a part of the problem definition by specifying a fragmentation of the search space into an ordered set of classic sketches. We provide two cooperating search algorithms to effectively solve metasketches. A global optimizing search coordinates the activities of local searches, informing them of the costs of potentially-optimal solutions as they explore different regions of the candidate space in parallel. The local searches execute an incremental form of counterexample-guided inductive synthesis to incorporate information sent from the global search. We present Synapse, an implementation of these algorithms, and show that it effectively solves optimal synthesis problems with a variety of different cost functions. In addition, metasketches can be used to accelerate classic (non-optimal) synthesis by explicitly controlling the search strategy, and we show that Synapse solves classic synthesis problems that state-of-the-art tools cannot.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Albarghouthi:2016:MSS, author = "Aws Albarghouthi and Isil Dillig and Arie Gurfinkel", title = "Maximal specification synthesis", journal = j-SIGPLAN, volume = "51", number = "1", pages = "789--801", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837628", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many problems in program analysis, verification, and synthesis require inferring specifications of unknown procedures. Motivated by a broad range of applications, we formulate the problem of maximal specification inference: Given a postcondition Phi and a program P calling a set of unknown procedures F_1,...,F_n, what are the most permissive specifications of procedures F_i that ensure correctness of P? 
In other words, we are looking for the smallest number of assumptions we need to make about the behaviours of F_i in order to prove that $P$ satisfies its postcondition. To solve this problem, we present a novel approach that utilizes a counterexample-guided inductive synthesis loop and reduces the maximal specification inference problem to multi-abduction. We formulate the novel notion of multi-abduction as a generalization of classical logical abduction and present an algorithm for solving multi-abduction problems. On the practical side, we evaluate our specification inference technique on a range of benchmarks and demonstrate its ability to synthesize specifications of kernel routines invoked by device drivers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Frankle:2016:EDS, author = "Jonathan Frankle and Peter-Michael Osera and David Walker and Steve Zdancewic", title = "Example-directed synthesis: a type-theoretic interpretation", journal = j-SIGPLAN, volume = "51", number = "1", pages = "802--815", month = jan, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2914770.2837629", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:57 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Input-output examples have emerged as a practical and user-friendly specification mechanism for program synthesis in many environments. While example-driven tools have demonstrated tangible impact that has inspired adoption in industry, their underlying semantics are less well-understood: what are ``examples'' and how do they relate to other kinds of specifications? This paper demonstrates that examples can, in general, be interpreted as refinement types. Seen in this light, program synthesis is the task of finding an inhabitant of such a type. This insight provides an immediate semantic interpretation for examples. Moreover, it enables us to exploit decades of research in type theory as well as its correspondence with intuitionistic logic rather than designing ad hoc theoretical frameworks for synthesis from scratch. We put this observation into practice by formalizing synthesis as proof search in a sequent calculus with intersection and union refinements that we prove to be sound with respect to a conventional type system. In addition, we show how to handle negative examples, which arise from user feedback or counterexample-guided loops. This theory serves as the basis for a prototype implementation that extends our core language to support ML-style algebraic data types and structurally inductive functions. Users can also specify synthesis goals using polymorphic refinements and import monomorphic libraries. The prototype serves as a vehicle for empirically evaluating a number of different strategies for resolving the nondeterminism of the sequent calculus---bottom-up theorem-proving, term enumeration with refinement type checking, and combinations of both---the results of which classify, explain, and validate the design choices of existing synthesis systems. 
It also provides a platform for measuring the practical value of a specification language that combines ``examples'' with the more general expressiveness of refinements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '16 conference proceedings.", } @Article{Homer:2016:ALG, author = "Michael Homer and Timothy Jones and James Noble", title = "From {APIs} to languages: generalising method names", journal = j-SIGPLAN, volume = "51", number = "2", pages = "1--12", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816708", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Method names with multiple separate parts are a feature of many dynamic languages derived from Smalltalk. Generalising the syntax of method names to allow parts to be repeated, optional, or alternatives, means a single definition can respond to a whole family of method requests. We show how generalising method names can support flexible APIs for domain-specific languages, complex initialisation tasks, and control structures defined in libraries. We describe how we have extended Grace to support generalised method names, and prove that such an extension can be integrated into a gradually-typed language while preserving type soundness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Maidl:2016:FTL, author = "Andr{\'e} Murbach Maidl and Fabio Mascarenhas and Roberto Ierusalimschy", title = "A formalization of typed {\tt lua}", journal = j-SIGPLAN, volume = "51", number = "2", pages = "13--25", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816709", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmers often migrate from a dynamically typed to a statically typed language when their simple scripts evolve into complex programs. Optional type systems are one way of having both static and dynamic typing in the same language, while keeping its dynamically typed semantics. This makes evolving a program from dynamic to static typing a matter of describing the implied types that it is using and adding annotations to make those types explicit. Designing an optional type system for an existing dynamically typed language is challenging, as its types should feel natural to programmers that are already familiar with this language. In this work, we give a formal description of Typed Lua, an optional type system for Lua, with a focus on two of its novel type system features: incremental evolution of imperative record and object types that is both lightweight and type-safe, and projection types, a combination of flow typing, functions that return multiple values, and multiple assignment. 
While our type system is tailored to the features and idioms of Lua, its features can be adapted to other imperative scripting languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Tanter:2016:GCP, author = "{\'E}ric Tanter and Nicolas Tabareau", title = "Gradual certified programming in {\tt coq}", journal = j-SIGPLAN, volume = "51", number = "2", pages = "26--40", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816710", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Expressive static typing disciplines are a powerful way to achieve high-quality software. However, the adoption cost of such techniques should not be under-estimated. Just like gradual typing allows for a smooth transition from dynamically-typed to statically-typed programs, it seems desirable to support a gradual path to certified programming. We explore gradual certified programming in Coq, providing the possibility to postpone the proofs of selected properties, and to check ``at runtime'' whether the properties actually hold. Casts can be integrated with the implicit coercion mechanism of Coq to support implicit cast insertion {\`a} la gradual typing. Additionally, when extracting Coq functions to mainstream languages, our encoding of casts supports lifting assumed properties into runtime checks. Much to our surprise, it is not necessary to extend Coq in any way to support gradual certified programming. A simple mix of type classes and axioms makes it possible to bring gradual certified programming to Coq in a straightforward manner.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Ernst:2016:MSD, author = "Erik Ernst and Anders M{\o}ller and Mathias Schwarz and Fabio Strocco", title = "Message safety in {Dart}", journal = j-SIGPLAN, volume = "51", number = "2", pages = "41--53", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816711", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Unlike traditional static type checking, the type system in the Dart programming language is unsound by design, even for fully annotated programs. The rationale has been that this allows compile-time detection of likely errors and enables code completion in integrated development environments, without being restrictive on programmers. Despite unsoundness, judicious use of type annotations can ensure useful properties of the runtime behavior of Dart programs. We present a formal model of a core of Dart with a focus on its type system, which allows us to elucidate the causes of unsoundness. Our main contribution is a characterization of message-safe programs and a theorem stating that such programs will never encounter 'message not understood' errors at runtime. 
Message safety is less restrictive than traditional type soundness, and we argue that it forms a natural intermediate point between dynamically typed and statically typed Dart programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Lyde:2016:CFA, author = "Steven Lyde and William E. Byrd and Matthew Might", title = "Control-flow analysis of dynamic languages via pointer analysis", journal = j-SIGPLAN, volume = "51", number = "2", pages = "54--62", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816712", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We demonstrate how to map a control-flow analysis for a higher-order language (dynamic languages are typically higher-order) into a pointer analysis for a first-order language, such as C. This allows us to use existing pointer analysis tools to perform a control-flow analysis, exploiting their technical advancements and the engineering effort that went into developing them. We compare the results of two recent parallel pointer analysis tools with a parallel control-flow analysis tool. While it has been known that a control-flow analysis of higher-order languages and a pointer analysis of first-order languages are very similar, we demonstrate that these two analyses are actually more similar than previously thought. We present the first mapping between a higher-order control-flow analysis and a pointer analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Feeley:2016:CML, author = "Marc Feeley", title = "Compiling for multi-language task migration", journal = j-SIGPLAN, volume = "51", number = "2", pages = "63--77", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816713", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Task migration allows a running program to continue its execution in a different destination environment. Increasingly, execution environments are defined by combinations of cultural and technological constraints, affecting the choice of host language, libraries and tools. A compiler supporting multiple target environments and task migration must be able to marshal continuations and then unmarshal and continue their execution, ideally, even if the language of the destination environment is different. In this paper, we propose a compilation approach based on a virtual machine that strikes a balance between implementation portability and efficiency. We explain its implementation within a Scheme compiler targeting JavaScript, PHP, Python, Ruby and Java --- some of the most popular host languages for web applications.
As our experiments show, this approach compares well with other Scheme compilers targeting high-level languages in terms of execution speed, being sometimes up to 3 orders of magnitude faster.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Grimmer:2016:HPC, author = "Matthias Grimmer and Chris Seaton and Roland Schatz and Thomas W{\"u}rthinger and Hanspeter M{\"o}ssenb{\"o}ck", title = "High-performance cross-language interoperability in a multi-language runtime", journal = j-SIGPLAN, volume = "51", number = "2", pages = "78--90", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816714", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmers combine different programming languages because it allows them to use the most suitable language for a given problem, to gradually migrate existing projects from one language to another, or to reuse existing source code. However, existing cross-language mechanisms suffer from complex interfaces, insufficient flexibility, or poor performance. We present the TruffleVM, a multi-language runtime that allows composing different language implementations in a seamless way. It reduces the amount of required boiler-plate code to a minimum by allowing programmers to access foreign functions or objects by using the notation of the host language. We compose language implementations that translate source code to an intermediate representation (IR), which is executed on top of a shared runtime system. Language implementations use language-independent messages that the runtime resolves at their first execution by transforming them to efficient foreign-language-specific operations. The TruffleVM avoids conversion or marshaling of foreign objects at the language boundary and allows the dynamic compiler to perform its optimizations across language boundaries, which guarantees high performance. This paper presents an implementation of our ideas based on the Truffle system and its guest language implementations JavaScript, Ruby, and C.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Leopoldseder:2016:JJT, author = "David Leopoldseder and Lukas Stadler and Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck", title = "{Java-to-JavaScript} translation via structured control flow reconstruction of compiler {IR}", journal = j-SIGPLAN, volume = "51", number = "2", pages = "91--103", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816715", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an approach to cross-compile Java bytecodes to JavaScript, building on existing Java optimizing compiler technology. Static analysis determines which Java classes and methods are reachable. These are then translated to JavaScript using a re-configured Java just-in-time compiler with a new back end that generates JavaScript instead of machine code.
Standard compiler optimizations such as method inlining and global value numbering, as well as advanced optimizations such as escape analysis, lead to compact and optimized JavaScript code. Compiler IR is unstructured, so structured control flow needs to be reconstructed before code generation is possible. We present details of our control flow reconstruction algorithm. Our system is based on Graal, an open-source optimizing compiler for the Java HotSpot VM and other VMs. The modular and VM-independent architecture of Graal allows us to reuse the intermediate representation, the bytecode parser, and the high-level optimizations. Our custom back end first performs control flow reconstruction and then JavaScript code generation. The generated JavaScript undergoes a set of optimizations to increase readability and performance. Static analysis is performed on the Graal intermediate representation as well. Benchmark results for medium-sized Java benchmarks such as SPECjbb2005 run with acceptable performance on the V8 JavaScript VM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Pape:2016:LIS, author = "Tobias Pape and Tim Felgentreff and Robert Hirschfeld and Anton Gulenko and Carl Friedrich Bolz", title = "Language-independent storage strategies for tracing {JIT}-based virtual machines", journal = j-SIGPLAN, volume = "51", number = "2", pages = "104--113", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816716", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Storage strategies have been proposed as a run-time optimization for the PyPy Python implementation and have shown promising results for optimizing execution speed and memory requirements. However, it remained unclear whether the approach works equally well in other dynamic languages. Furthermore, while PyPy is based on RPython, a language to write VMs with reusable components such as a tracing just-in-time compiler and garbage collection, the strategies design itself was not generalized to be reusable across languages implemented using that same toolchain. In this paper, we present a general design and implementation for storage strategies and show how they can be reused across different RPython-based languages. 
We evaluate the performance of our implementation for RSqueak, an RPython-based VM for Squeak/Smalltalk, and show that storage strategies may indeed offer performance benefits for certain workloads in other dynamic programming languages. We furthermore evaluate the generality of our implementation by applying it to Topaz, a Ruby VM, and Pycket, a Racket implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Aakerblom:2016:MPP, author = "Beatrice {\AA}kerblom and Tobias Wrigstad", title = "Measuring polymorphism in {Python} programs", journal = j-SIGPLAN, volume = "51", number = "2", pages = "114--128", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816717", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Following the increased popularity of dynamic languages and their increased use in critical software, there have been many proposals to retrofit static type systems to these languages to improve possibilities to catch bugs and improve performance. A key question for any type system is whether the types should be structural, for more expressiveness, or nominal, to carry more meaning for the programmer. For retrofitted type systems, it seems the current trend is using structural types. This paper attempts to answer the question to what extent this extra expressiveness is needed, and how the possible polymorphism in dynamic code is used in practice. We study polymorphism in 36 real-world open source Python programs and approximate to what extent nominal and structural types could be used to type these programs. The study is based on collecting traces from multiple runs of the programs and analysing the polymorphic degrees of targets at more than 7 million call-sites. Our results show that while polymorphism is used in all programs, the programs are to a great extent monomorphic. The polymorphism found is evenly distributed across libraries and program-specific code and occurs both during program start-up and normal execution. Most programs contain a few ``megamorphic'' call-sites where receiver types vary widely. The non-monomorphic parts of the programs can to some extent be typed with nominal or structural types, but none of the approaches can type entire programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Alcocer:2016:TPV, author = "Juan Pablo Sandoval Alcocer and Alexandre Bergel", title = "Tracking down performance variation against source code evolution", journal = j-SIGPLAN, volume = "51", number = "2", pages = "129--139", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816718", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Little is known about how software performance evolves across software revisions. The severity of this situation is high since (i) most performance variations seem to happen accidentally and (ii) addressing a performance regression is challenging, especially when functional code is stacked on it.
This paper reports an empirical study on the performance evolution of 19 applications, totaling over 19 MLOC. It took 52 days to run our 49 benchmarks. By relating performance variation with source code revisions, we found out that: (i) 1 out of every 3 application revisions introduces a performance variation, (ii) performance variations may be classified into 9 patterns, (iii) the most prominent cause of performance regression involves loops and collections. We carefully describe the patterns we identified, and detail how we addressed the numerous challenges we faced to complete our experiment.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Kedlaya:2016:SST, author = "Madhukar N. Kedlaya and Behnam Robatmili and Ben Hardekopf", title = "Server-side type profiling for optimizing client-side {JavaScript} engines", journal = j-SIGPLAN, volume = "51", number = "2", pages = "140--153", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816719", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern JavaScript engines optimize hot functions using a JIT compiler along with type information gathered by an online profiler. However, the profiler's information can be unsound and when unexpected types are encountered the engine must recover using an expensive mechanism called deoptimization. In this paper we describe a method to significantly reduce the number of deoptimizations observed by client-side JavaScript engines by using ahead-of-time profiling on the server-side. Unlike previous work on ahead-of-time profiling for statically-typed languages such as Java, our technique must operate on a dynamically-typed language, which significantly changes the required insights and methods to make the technique effective. We implement our proposed technique using the SpiderMonkey JavaScript engine, and we evaluate our implementation using three different kinds of benchmarks: the industry-standard Octane benchmark suite, a set of JavaScript physics engines, and a set of real-world websites from the Membench50 benchmark suite. 
We show that using ahead-of-time profiling provides significant performance benefits over the baseline vanilla SpiderMonkey engine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Fischer:2016:EIE, author = "Lars Fischer and Stefan Hanenberg", title = "An empirical investigation of the effects of type systems and code completion on {API} usability using {TypeScript} and {JavaScript} in {MS Visual Studio}", journal = j-SIGPLAN, volume = "51", number = "2", pages = "154--167", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816720", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent empirical studies that compared static and dynamic type systems on API usability showed a positive impact of static type systems on developer productivity in most cases. Nevertheless, it is unclear how large this effect is in comparison to other factors. One obvious factor in programming is tooling: It is commonly accepted that modern IDEs have a large positive impact on developers, although it is not clear which parts of modern IDEs are responsible for that. One possible---and for most developers obvious---candidate is code completion. This paper describes a 2x2 randomized trial that compares JavaScript and Microsoft's statically typed alternative TypeScript with and without code completion in MS Visual Studio. While the experiment shows (in correspondence to previous experiments) a large positive effect of the statically typed language TypeScript, the code completion effect is not only marginal, but also just approaching statistical significance. This seems to be an indicator that the effect of static type systems is larger than often assumed, at least in comparison to code completion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Teruel:2016:ACR, author = "Camille Teruel and St{\'e}phane Ducasse and Damien Cassou and Marcus Denker", title = "Access control to reflection with object ownership", journal = j-SIGPLAN, volume = "51", number = "2", pages = "168--176", month = feb, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936313.2816721", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reflection is a powerful programming language feature that enables language extensions, generic code, dynamic analyses, development tools, etc. However, uncontrolled reflection breaks object encapsulation and considerably increases the attack surface of programs, e.g., malicious libraries can use reflection to attack their client applications. To bring reflection and object encapsulation back together, we use dynamic object ownership to design an access control policy to reflective operations. This policy grants objects full reflective power over the objects they own but limited reflective power over other objects.
Code is still able to use advanced reflective operations but reflection cannot be used as an attack vector anymore.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '15 conference proceedings.", } @Article{Narasimhan:2016:NGS, author = "Priya Narasimhan and Utsav Drolia and Jiaqi Tan and Nathan D. Mickulicz and Rajeev Gandhi", title = "The next-generation in-stadium experience (keynote)", journal = j-SIGPLAN, volume = "51", number = "3", pages = "1--10", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814205", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "YinzCam is a cloud-hosted service that provides sports fans with real-time scores, news, photos, statistics, live radio, streaming video, etc., on their mobile devices. YinzCam's infrastructure is currently hosted on Amazon Web Services (AWS) and supports over 30 million installs of the official mobile apps of 140+ NHL/NFL/NBA/NRL/NCAA sports teams and venues. YinzCam's workload is necessarily multi-modal (e.g., pre-game, in-game, post-game, game-day, non-gameday), with normal game-time traffic being twenty-fold of that on non-game days. This paper describes the evolution of YinzCam's production architecture and distributed infrastructure, from its beginnings in 2009, when it was used to support thousands of concurrent users, to today's system that supports millions of concurrent users on any game day. We also discuss key new opportunities to improve the fan experience inside the stadium of the future, without impacting the available bandwidth, by crowd-sourcing the thousands of mobile devices that are in fans' hands inside these venues. We present Krowd, a novel distributed key-value store for promoting efficient content sharing, discovery and retrieval across the mobile devices inside a stadium. We present CHIPS, a system that ensures that users' privacy is maintained while their devices participate in the crowdsourced infrastructure.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Leissa:2016:SED, author = "Roland Lei{\ss}a and Klaas Boesche and Sebastian Hack and Richard Membarth and Philipp Slusallek", title = "Shallow embedding of {DSLs} via online partial evaluation", journal = j-SIGPLAN, volume = "51", number = "3", pages = "11--20", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814208", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper investigates shallow embedding of DSLs by means of online partial evaluation. To this end, we present a novel online partial evaluator for continuation-passing style languages. We argue that it has, in contrast to prior work, a predictable termination policy that works well in practice. We present our approach formally using a continuation-passing variant of PCF and prove its termination properties. 
We evaluate our technique experimentally in the field of visual and high-performance computing and show that our evaluator produces highly specialized and efficient code for CPUs as well as GPUs that matches the performance of hand-tuned expert code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Scherr:2016:AFC, author = "Maximilian Scherr and Shigeru Chiba", title = "Almost first-class language embedding: taming staged embedded {DSLs}", journal = j-SIGPLAN, volume = "51", number = "3", pages = "21--30", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814217", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Embedded domain-specific languages (EDSLs), inheriting a general-purpose language's features as well as look-and-feel, have traditionally been second-class or rather non-citizens in terms of host-language design. This makes sense when one regards them to be on the same level as traditional, non-EDSL library interfaces. However, this equivalence only applies to the simplest of EDSLs. In this paper we illustrate why this is detrimental when moving on to EDSLs that employ staging, i.e. program reification, by example of various issues that affect authors and users alike. We believe that if EDSLs are to be considered a reliable, language-like interface abstraction, they require exceptional attention and design scrutiny. Instead of unenforceable conventions, we advocate the acceptance of EDSLs as proper, i.e. almost first-class, citizens while retaining most advantages of pure embeddings. As a small step towards this goal, we present a pragmatic framework prototype for Java. It is based on annotations that explicate and document membership to explicit EDSL entities. In a nutshell, our framework identifies (annotated) method calls and field accesses as EDSL terms and dynamically constructs an abstract-syntax representation, which is eventually passed to a semantics-defining back end implemented by the EDSL author.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Reynders:2016:GSB, author = "Bob Reynders and Dominique Devriese and Frank Piessens", title = "Generating safe boundary {APIs} between typed {EDSLs} and their environments", journal = j-SIGPLAN, volume = "51", number = "3", pages = "31--34", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814219", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Embedded domain specific languages (EDSLs) are used to represent special-purpose code in a general-purpose language and they are used for applications like vector calculations and run-time code generation. Often, code in an EDSL is compiled to a target (e.g. GPU languages, JVM bytecode, assembly, JavaScript) and needs to interface with other code that is available at that level but uses other data representations or calling conventions. 
We present an approach for safely making such APIs available in a typed EDSL, guaranteeing correct conversions between data representations and respect for calling conventions. When the code being interfaced with is the result of static compilation of host language code, we propose a way to auto-generate the needed boilerplate using meta-programming. We instantiate our technique with JavaScript as the target language, JS-Scala as the EDSL, Scala.js as the static compiler and Scala macros to generate the boilerplate, but our design is more generally applicable. We provide evidence of the usefulness of our approach through a prototype implementation that we have applied in a non-trivial code base.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Medeiros:2016:ESC, author = "Fl{\'a}vio Medeiros and Iran Rodrigues and M{\'a}rcio Ribeiro and Leopoldo Teixeira and Rohit Gheyi", title = "An empirical study on configuration-related issues: investigating undeclared and unused identifiers", journal = j-SIGPLAN, volume = "51", number = "3", pages = "35--44", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814206", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The variability of configurable systems may lead to configuration-related issues (i.e., faults and warnings) that appear only when we select certain configuration options. Previous studies found that issues related to configurability are harder to detect than issues that appear in all configurations, because variability increases the complexity. However, little effort has been put into understanding configuration-related faults (e.g., undeclared functions and variables) and warnings (e.g., unused functions and variables). To better understand the peculiarities of configuration-related undeclared/unused variables and functions, in this paper we perform an empirical study of 15 systems to answer research questions related to how developers introduce these issues, the number of configuration options involved, and the time that these issues remain in source files. To make the analysis of several projects feasible, we propose a strategy that minimizes the initial setup problems of variability-aware tools. We detect and confirm 2 undeclared variables, 14 undeclared functions, 16 unused variables, and 7 unused functions related to configurability. We submit 30 patches to fix issues not fixed by developers. Our findings support the effectiveness of sampling (i.e., analysis of only a subset of valid configurations) because most issues involve two or fewer configuration options. Nevertheless, by analyzing the version history of the projects, we observe that a number of issues remain in the code for several years.
Furthermore, the corpus of undeclared/unused variables and functions gathered is a valuable source to study these issues, compare sampling algorithms, and test and improve variability-aware tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{El-Sharkawy:2016:AKS, author = "Sascha El-Sharkawy and Adam Krafczyk and Klaus Schmid", title = "Analysing the {Kconfig} semantics and its analysis tools", journal = j-SIGPLAN, volume = "51", number = "3", pages = "45--54", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814222", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Linux kernel is often used as a real world case study to demonstrate novel Software Product Line Engineering research methods. An important point in this is often the analysis of the Kconfig semantics. However, we detected that the semantics of Kconfig is rather unclear and has many special cases, which are not documented in its short specification. We performed a systematic analysis to uncover the correct behaviour of Kconfig and present the results, which are necessary for applying semantically correct analyses. Further, we analyse whether existing analysis tools of the research community are aware of the correct semantics of Kconfig. These analyses can be used for improving existing analysis tools as well as for decision support when selecting an appropriate tool for a specific analysis. In summary, we contribute to a better understanding of Kconfig in the research community to improve the validity of evaluations based on Linux.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Alshara:2016:MLO, author = "Zakarea Alshara and Abdelhak-Djamel Seriai and Chouki Tibermacine and Hinde Lilia Bouziane and Christophe Dony and Anas Shatnawi", title = "Migrating large object-oriented applications into component-based ones: instantiation and inheritance transformation", journal = j-SIGPLAN, volume = "51", number = "3", pages = "55--64", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814223", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large object-oriented applications have complex and numerous dependencies, and usually do not have explicit software architectures. Therefore they are hard to maintain, and parts of them are difficult to reuse. The component-based development paradigm emerged for improving these aspects and for supporting effective maintainability and reuse. It provides better understandability through a high-level architecture view of the application. Thereby migrating object-oriented applications to component-based ones will contribute to improving these characteristics (maintainability and reuse). In this paper, we propose an approach to automatically transform object-oriented applications to component-based ones. More particularly, the input of the approach is the result provided by software architecture recovery: a component-based architecture description.
Then, our approach transforms the object-oriented source code in order to produce deployable components. We focus in this paper on the transformation of source code related to instantiation and inheritance dependencies between classes that are in different components. We experimented with the proposed solution by transforming a collection of Java applications into the OSGi framework. The experimental results are discussed in this paper.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Lopez:2016:SSP, author = "Michael Lopez and C. Jasson Casey and Gabriel {Dos Reis} and Colton Chojnacki", title = "Safer {SDN} programming through {Arbiter}", journal = j-SIGPLAN, volume = "51", number = "3", pages = "65--74", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814218", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software Defined Networking (SDN) programs are written with respect to assumptions on software and hardware facilities and protocol definitions. Silent mismatches between the expected feature set and implemented feature set of SDN artifacts can easily lead to hard-to-debug network configurations, decreased network performance, outages, or worse, security vulnerabilities. We show how the paradigm of axiomatic programming, supported by practical dependent types, provides effective support for SDN executable specifications and verification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Kolesnichenko:2016:CBG, author = "Alexey Kolesnichenko and Christopher M. Poskitt and Sebastian Nanz and Bertrand Meyer", title = "Contract-based general-purpose {GPU} programming", journal = j-SIGPLAN, volume = "51", number = "3", pages = "75--84", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814216", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Using GPUs as general-purpose processors has revolutionized parallel computing by offering, for a large and growing set of algorithms, massive data-parallelization on desktop machines. An obstacle to widespread adoption, however, is the difficulty of programming them and the low-level control of the hardware required to achieve good performance. This paper suggests a programming library, SafeGPU, that aims at striking a balance between programmer productivity and performance, by making GPU data-parallel operations accessible from within a classical object-oriented programming language. The solution is integrated with the design-by-contract approach, which increases confidence in functional program correctness by embedding executable program specifications into the program text. We show that our library leads to modular and maintainable code that is accessible to GPGPU non-experts, while providing performance that is comparable with hand-written CUDA code.
Furthermore, runtime contract checking turns out to be feasible, as the contracts can be executed on the GPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Yamaguchi:2016:IMS, author = "Hiroshi Yamaguchi and Shigeru Chiba", title = "Inverse macro in {Scala}", journal = j-SIGPLAN, volume = "51", number = "3", pages = "85--94", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814213", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a new variant of typed syntactic macro systems named inverse macro, which improves the expressiveness of macro systems. The inverse macro system makes it possible to implement operators with complex side-effects, such as lazy operators and delimited continuation operators, which are beyond the power of existing macro systems. We have implemented the inverse macro system as an extension to Scala 2.11. We also show the expressiveness of the inverse macro system by comparing two versions of shift/reset, bundled in Scala 2.11 and implemented with the inverse macro system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Adam:2016:TTS, author = "Sorin Adam and Ulrik Pagh Schultz", title = "Towards tool support for spreadsheet-based domain-specific languages", journal = j-SIGPLAN, volume = "51", number = "3", pages = "95--98", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814215", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Spreadsheets are commonly used by non-programmers to store data in a structured form; this data can in some cases be considered to be a program in a domain-specific language (DSL). Unlike for ordinary text-based domain-specific languages, however, there is currently no formalism for expressing the syntax of such spreadsheet-based DSLs (SDSLs), and there is no tool support for automatically generating language infrastructure such as parsers and IDE support. In this paper we define a simple notion of two-dimensional grammars for SDSLs, and show how such grammars can be used for automatically generating parsers that extract structured data from a spreadsheet in the form of an AST.
We demonstrate automatic generation of parsers for a number of examples, including the questionnaire DSL from LWC2014 and a DSL for writing safety specifications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Byalik:2016:NNA, author = "Antuan Byalik and Sanchit Chadha and Eli Tilevich", title = "Native-$2$-native: automated cross-platform code synthesis from web-based programming resources", journal = j-SIGPLAN, volume = "51", number = "3", pages = "99--108", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814210", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For maximal market penetration, popular mobile applications are typically supported on all major platforms, including Android and iOS. Despite the vast differences in the look-and-feel of major mobile platforms, applications running on these platforms in essence provide the same core functionality. As an application is maintained and evolved, the resulting changes must be replicated on all the supported platforms, a tedious and error-prone programming process. Existing automated source-to-source translation tools prove inadequate due to the structural and idiomatic differences in how functionalities are expressed across major platforms. In this paper, we present a new approach---Native-2-Native---that automatically synthesizes code for a mobile application to make use of native resources on one platform, based on the equivalent program transformations performed on another platform. First, the programmer modifies a mobile application's Android version to make use of some native resource, with a plugin capturing code changes. Based on the changes, the system then parameterizes a web search query over popular programming resources (e.g., Google Code, StackOverflow, etc.), to discover equivalent iOS code blocks with the closest similarity to the programmer-written Android code. The discovered iOS code block is then presented to the programmer as an automatically synthesized Swift source file to further fine-tune and subsequently integrate in the mobile application's iOS version. Our evaluation, enhancing mobile applications to make use of common native resources, shows that the presented approach can correctly synthesize more than 86\% of Swift code for the subject applications' iOS versions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Makarov:2016:CMS, author = "Dmitri Makarov and Matthias Hauswirth", title = "{CLOP}: a multi-stage compiler to seamlessly embed heterogeneous code", journal = j-SIGPLAN, volume = "51", number = "3", pages = "109--112", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814211", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Heterogeneous programming complicates software development. 
We present CLOP, a platform that embeds code targeting heterogeneous compute devices in a convenient and clean way, allowing unobstructed data flow between the host code and the devices, reducing the amount of source code by an order of magnitude. The CLOP compiler uses the standard facilities of the D programming language to generate code strictly at compile-time. In this paper we describe the CLOP language and the CLOP compiler implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Ringert:2016:CCG, author = "Jan Oliver Ringert and Bernhard Rumpe and Andreas Wortmann", title = "Composing code generators for {C\&C} {ADLs} with Application-specific behavior languages (tool demonstration)", journal = j-SIGPLAN, volume = "51", number = "3", pages = "113--116", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814224", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modeling software systems as component {\&} connector architectures with application-specific behavior modeling languages enables domain experts to describe each component behavior with the most appropriate language. Generating executable systems for such language aggregates requires composing appropriate code generators for the participating languages. Previous work on code generator composition either focuses on white-box integration based on code generator internals or requires extensive handcrafting of integration code. We demonstrate an approach to black-box generator composition for architecture description languages that relies on explicit interfaces and exploits the encapsulation of components. This approach is implemented for the architecture modeling framework MontiArcAutomaton and has been evaluated in various contexts. Ultimately, black-box code generator composition facilitates development of code generators for architecture description languages with embedded behavior languages and increases code generator reuse.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Kabac:2016:OMS, author = "Milan Kab{\'a}c and Charles Consel", title = "Orchestrating masses of sensors: a design-driven development approach", journal = j-SIGPLAN, volume = "51", number = "3", pages = "117--120", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814226", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper proposes a design-driven development approach that is dedicated to the domain of orchestration of masses of sensors. The developer declares what an application does using a domain-specific language (DSL). 
Our compiler processes domain-specific declarations to generate a customized programming framework that guides and supports the programming phase.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Foust:2016:GRP, author = "Gabriel Foust and Jaakko J{\"a}rvi and Sean Parent", title = "Generating reactive programs for graphical user interfaces from multi-way dataflow constraint systems", journal = j-SIGPLAN, volume = "51", number = "3", pages = "121--130", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814207", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For a GUI to remain responsive, it must be able to schedule lengthy tasks to be executed asynchronously. In the traditional approach to GUI implementation---writing functions to handle individual user events---asynchronous programming easily leads to defects. Ensuring that all data dependencies are respected is difficult when new events arrive while prior events are still being handled. Reactive programming techniques, gaining popularity in GUI programming, help since they make data dependencies explicit and enforce them automatically as variables' values change. However, data dependencies in GUIs usually change along with their state. Reactive programming must therefore describe a GUI as a collection of many reactive programs, whose interaction the programmer must explicitly coordinate. This paper presents a declarative approach for GUI programming that relieves the programmer from coordinating asynchronous computations. The approach is based on our prior work on ``property models'', where GUI state is maintained by a dataflow constraint system. A property model responds to user events by atomically constructing new data dependencies and scheduling asynchronous computations to enforce those dependencies. In essence, a property model dynamically generates a reactive program, adding to it as new events occur. The approach gives the following guarantee: the same sequence of events produces the same results, regardless of the timing of those events.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Florence:2016:PPP, author = "Spencer P. Florence and Burke Fetscher and Matthew Flatt and William H. Temps and Tina Kiguradze and Dennis P. West and Charlotte Niznik and Paul R. Yarnold and Robert Bruce Findler and Steven M. Belknap", title = "{POP-PL}: a patient-oriented prescription programming language", journal = j-SIGPLAN, volume = "51", number = "3", pages = "131--140", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814221", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Medical professionals have long used algorithmic thinking to describe and implement health care processes without the benefit of the conceptual framework provided by a programming language. Instead, medical algorithms are expressed using English, flowcharts, or data tables.
This results in prescriptions that are difficult to understand, hard to debug, and awkward to reuse. This paper reports on the design and evaluation of a domain-specific programming language, POP-PL, for expressing medical algorithms. The design draws on the experience of researchers in two disciplines, programming languages and medicine. The language is based around the idea that programs and humans have complementary strengths that, when combined, can make for safer, more accurate performance of prescriptions. We implemented a prototype of our language and evaluated its design by writing prescriptions in the new language and administering a usability survey to medical professionals. This formative evaluation suggests that medical prescriptions can be conveyed by a programming language's mode of expression and provides useful information for refining the language. Analysis of the survey results suggests that medical professionals can understand and correctly modify programs in POP-PL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Selgrad:2016:LGV, author = "Kai Selgrad and Alexander Lier and Franz K{\"o}ferl and Marc Stamminger and Daniel Lohmann", title = "Lightweight, generative variant exploration for high-performance graphics applications", journal = j-SIGPLAN, volume = "51", number = "3", pages = "141--150", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814220", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Rendering performance is an everlasting goal of computer graphics and a significant driver for advances in both hardware architecture and algorithms. Thereby, it has become possible to apply advanced computer graphics technology even in low-cost embedded appliances, such as car instruments. Yet, to come up with an efficient implementation, developers have to put enormous efforts into hardware/problem-specific tailoring, fine-tuning, and domain exploration, which requires profound expert knowledge. If a good solution has been found, there is a high probability that it does not work as well with other architectures or even the next hardware generation. Generative DSL-based approaches could mitigate these efforts and provide for an efficient exploration of algorithmic variants and hardware-specific tuning ideas. However, in vertically organized industries, such as automotive, suppliers are reluctant to introduce these techniques as they fear loss of control, high introduction costs, and additional constraints imposed by the OEM with respect to software and tool-chain certification. Moreover, suppliers do not want to share their generic solutions with the OEM, but only concrete instances. To this end, we propose a light-weight and incremental approach for meta programming of graphics applications. Our approach relies on an existing formulation of C-like languages that is amenable to meta programming, which we extend to become a lightweight language to combine algorithmic features.
Our method provides a concise notation for meta programs and generates easily sharable output in the appropriate C-style target language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Seidl:2016:GSP, author = "Christoph Seidl and Sven Schuster and Ina Schaefer", title = "Generative software product line development using variability-aware design patterns", journal = j-SIGPLAN, volume = "51", number = "3", pages = "151--160", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814212", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software Product Lines (SPLs) are an approach to reuse in-the-large that models a set of closely related software systems in terms of commonalities and variabilities. Design patterns are best practices for addressing recurring design problems in object-oriented source code. In the practice of implementing an SPL, instances of certain design patterns are employed to handle variability, which makes these ``variability-aware design patterns'' a best practice for SPL design. However, there currently is no dedicated method for proactively developing SPL using design patterns suitable for realizing variable functionality. In this paper, we present a method to perform generative SPL development with design patterns. We use role models to capture design patterns and their relation to a variability model. We further allow mapping of individual design pattern roles to elements of realization artifacts to be generated (e.g., classes, methods) and check the conformance of the realization with the specification of the pattern. With this method, we support proactive development of SPL using design patterns to apply best practices for the realization of variability. We present an implementation of our approach within the Eclipse IDE and demonstrate it within a case study.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Font:2016:AMR, author = "Jaime Font and Lorena Arcega and {\O}ystein Haugen and Carlos Cetina", title = "Addressing metamodel revisions in model-based software product lines", journal = j-SIGPLAN, volume = "51", number = "3", pages = "161--170", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814214", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Metamodels evolve over time, which can break the conformance between the models and the metamodel. Model migration strategies aim to co-evolve models and metamodels together, but their application is not fully automatizable and is thus cumbersome and error prone. We introduce the Variable MetaModel (VMM) strategy to address the evolution of the reusable model assets of a model-based Software Product Line. The VMM strategy applies variability modeling ideas to express the evolution of the metamodel in terms of commonalities and variabilities. When the metamodel evolves, the models continue to conform to the VMM, avoiding the need for migration. 
We have applied both the traditional migration strategy and the VMM strategy to a retrospective case study that includes 13 years of evolution of our industrial partner, an induction hobs manufacturer. The comparison between the two strategies shows better results for the VMM strategy in terms of model indirection, automation, and trust leak.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Inostroza:2016:MIM, author = "Pablo Inostroza and Tijs van der Storm", title = "Modular interpreters for the masses: implicit context propagation using object algebras", journal = j-SIGPLAN, volume = "51", number = "3", pages = "171--180", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814209", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modular interpreters have the potential to achieve component-based language development: instead of writing language interpreters from scratch, they can be assembled from reusable, semantic building blocks. Unfortunately, traditional language interpreters are hard to extend because different language constructs may require different interpreter signatures. For instance, arithmetic interpreters produce a value without any context information, whereas binding constructs require an additional environment. In this paper, we present a practical solution to this problem based on implicit context propagation. By structuring denotational-style interpreters as Object Algebras, base interpreters can be retroactively lifted into new interpreters that have an extended signature. The additional parameters are implicitly propagated behind the scenes, through the evaluation of the base interpreter. Interpreter lifting enables a flexible style of component-based language development. The technique works in mainstream object-oriented languages, does not sacrifice type safety or separate compilation, and can be easily automated. We illustrate implicit context propagation using a modular definition of Featherweight Java and its extension to support side-effects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Noguera:2016:MQQ, author = "Carlos Noguera and Viviane Jonckers", title = "Model querying with query models", journal = j-SIGPLAN, volume = "51", number = "3", pages = "181--184", month = mar, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2936314.2814225", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Model querying is an integral part of Model-Driven Engineering. Developers query models when specifying model transformations, when defining model constraints, or simply when they need to extract some information from the model. Model queries are often specified in a general-purpose programming language, with developers just navigating models through their programming interfaces. OCL is the best known model query language, and while successful, it is difficult to express complex structural properties featured in target model elements. 
In this paper we describe a model query facility that aims at easing the description of structural features in a query. In our approach, developers model their queries by reusing fragments of the target model to specify the invariant parts of the template, augmented with variables and special relations to specify what can vary. The query itself conforms to a meta-model that extends the meta-model under-query. By reusing the queried meta-model developers can reduce the mental overhead that comes from using a different language to specify the queries. We have developed a proof of concept tool for the Eclipse Modeling Framework (EMF) that (1) generates a query meta-model from a target meta-model, (2) allows the construction of queries using a graphical, graph-based editor and (3) executes the queries by translating them to a set of logic predicates that are then evaluated using an extension of the logic-based query language Ekeko.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '15 conference proceedings.", } @Article{Zhou:2016:PUH, author = "Yuanyuan Zhou", title = "Programming Uncertain{$<$T$>$}hings", journal = j-SIGPLAN, volume = "51", number = "4", pages = "1--2", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872416", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Innovation flourishes with good abstractions. For instance, codification of the IEEE Floating Point standard in 1985 was critical to the subsequent success of scientific computing. Programming languages currently lack appropriate abstractions for uncertain data. Applications already use estimates from sensors, machine learning, big data, humans, and approximate algorithms, but most programming languages do not help developers address correctness, programmability, and optimization problems due to estimates. To address these problems, we propose a new programming abstraction called Uncertain{$<$T$>$}. We encourage the community to develop and use abstractions for estimates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Abadal:2016:WAF, author = "Sergi Abadal and Albert Cabellos-Aparicio and Eduard Alarcon and Josep Torrellas", title = "{WiSync}: an Architecture for Fast Synchronization through On-Chip Wireless Communication", journal = j-SIGPLAN, volume = "51", number = "4", pages = "3--17", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872396", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In shared-memory multiprocessing, fine-grain synchronization is challenging because it requires frequent communication. As technology scaling delivers larger manycore chips, such pattern is expected to remain costly to support. In this paper, we propose to address this challenge by using on-chip wireless communication. Each core has a transceiver and an antenna to communicate with all the other cores. This environment supports very low latency global communication. Our architecture, called WiSync, uses a per-core Broadcast Memory (BM).
When a core writes to its BM, all the other 100+ BMs get updated in less than 10 processor cycles. We also use a second wireless channel with cheaper transfers to execute barriers efficiently. WiSync supports multiprogramming, virtual memory, and context switching. Our evaluation with simulations of 128-threaded kernels and 64-threaded applications shows that WiSync speeds-up synchronization substantially. Compared to using advanced conventional synchronization, WiSync attains an average speedup of nearly one order of magnitude for the kernels, and 1.12 for PARSEC and SPLASH-2.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Wang:2016:RTE, author = "Xiaodong Wang and Jos{\'e} F. Mart{\'\i}nez", title = "{ReBudget}: Trading Off Efficiency vs. Fairness in Market-Based Multicore Resource Allocation via Runtime Budget Reassignment", journal = j-SIGPLAN, volume = "51", number = "4", pages = "19--32", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872382", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Efficiently allocating shared resources in computer systems is critical to optimizing execution. Recently, a number of market-based solutions have been proposed to attack this problem. Some of them provide provable theoretical bounds to efficiency and/or fairness losses under market equilibrium. However, they are limited to markets with potentially important constraints, such as enforcing equal budget for all players, or curve-fitting players' utility into a specific function type. Moreover, they do not generally provide an intuitive ``knob'' to control efficiency vs. fairness. In this paper, we introduce two new metrics, Market Utility Range (MUR) and Market Budget Range (MBR), through which we provide for the first time theoretical bounds on efficiency and fairness of market equilibria under arbitrary budget assignments. We leverage this result and propose ReBudget, an iterative budget re-assignment algorithm that can be used to control efficiency vs. fairness at run-time. We apply our algorithm to a multi-resource allocation problem in multicore chips. Our evaluation using detailed execution-driven simulations shows that our budget re-assignment technique is intuitive, effective, and efficient.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Zhu:2016:DEQ, author = "Haishan Zhu and Mattan Erez", title = "Dirigent: Enforcing {QoS} for Latency-Critical Tasks on Shared Multicore Systems", journal = j-SIGPLAN, volume = "51", number = "4", pages = "33--47", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872394", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Latency-critical applications suffer from both average performance degradation and reduced completion time predictability when collocated with batch tasks. 
Such variation forces the system to overprovision resources to ensure Quality of Service (QoS) for latency-critical tasks, degrading overall system throughput. We explore the causes of this variation and exploit the opportunities of mitigating variation directly to simultaneously improve both QoS and utilization. We develop, implement, and evaluate Dirigent, a lightweight performance-management runtime system that accurately controls the QoS of latency-critical applications at fine time scales, leveraging existing architecture mechanisms. We evaluate Dirigent on a real machine and show that it is significantly more effective than configurations representative of prior schemes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Kuperman:2016:PR, author = "Yossi Kuperman and Eyal Moscovici and Joel Nider and Razya Ladelsky and Abel Gordon and Dan Tsafrir", title = "Paravirtual Remote {I/O}", journal = j-SIGPLAN, volume = "51", number = "4", pages = "49--65", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872378", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The traditional ``trap and emulate'' I/O paravirtualization model conveniently allows for I/O interposition, yet it inherently incurs costly guest-host context switches. The newer ``sidecore'' model eliminates this overhead by dedicating host (side)cores to poll the relevant guest memory regions and react accordingly without context switching. But the dedication of sidecores on each host might be wasteful when I/O activity is low, or it might not provide enough computational power when I/O activity is high. We propose to alleviate this problem at rack scale by consolidating the dedicated sidecores spread across several hosts onto one server. The hypervisor is then effectively split into two parts: the local hypervisor that hosts the VMs, and the remote hypervisor that processes their paravirtual I/O. We call this model vRIO---paraVirtual Remote I/O. We find that by increasing the latency somewhat, it provides comparable throughput with fewer sidecores and superior throughput with the same number of sidecores as compared to the state of the art. vRIO additionally constitutes a new, cost-effective way to consolidate I/O devices (on the remote hypervisor) while supporting efficient programmable I/O interposition.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Kaufmann:2016:HPP, author = "Antoine Kaufmann and Simon Peter and Naveen Kr. Sharma and Thomas Anderson and Arvind Krishnamurthy", title = "High Performance Packet Processing with {FlexNIC}", journal = j-SIGPLAN, volume = "51", number = "4", pages = "67--81", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872367", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The recent surge of network I/O performance has put enormous pressure on memory and software I/O processing sub systems. 
We argue that the primary reason for high memory and processing overheads is the inefficient use of these resources by current commodity network interface cards (NICs). We propose FlexNIC, a flexible network DMA interface that can be used by operating systems and applications alike to reduce packet processing overheads. FlexNIC allows services to install packet processing rules into the NIC, which then executes simple operations on packets while exchanging them with host memory. Thus, our proposal moves some of the packet processing traditionally done in software to the NIC, where it can be done flexibly and at high speed. We quantify the potential benefits of FlexNIC by emulating the proposed FlexNIC functionality with existing hardware or in software. We show that significant gains in application performance are possible, in terms of both latency and throughput, for several widely used applications, including a key-value store, a stream processing system, and an intrusion detection system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Bornholt:2016:SCF, author = "James Bornholt and Antoine Kaufmann and Jialin Li and Arvind Krishnamurthy and Emina Torlak and Xi Wang", title = "Specifying and Checking File System Crash-Consistency Models", journal = j-SIGPLAN, volume = "51", number = "4", pages = "83--98", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872406", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Applications depend on persistent storage to recover state after system crashes. But the POSIX file system interfaces do not define the possible outcomes of a crash. As a result, it is difficult for application writers to correctly understand the ordering of and dependencies between file system operations, which can lead to corrupt application state and, in the worst case, catastrophic data loss. This paper presents crash-consistency models, analogous to memory consistency models, which describe the behavior of a file system across crashes. Crash-consistency models include both litmus tests, which demonstrate allowed and forbidden behaviors, and axiomatic and operational specifications. We present a formal framework for developing crash-consistency models, and a toolkit, called Ferrite, for validating those models against real file system implementations. We develop a crash-consistency model for ext4, and use Ferrite to demonstrate unintuitive crash behaviors of the ext4 implementation. To demonstrate the utility of crash-consistency models to application writers, we use our models to prototype proof-of-concept verification and synthesis tools, as well as new library interfaces for crash-safe applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Prasad:2016:PMR, author = "Aravinda Prasad and K. 
Gopinath", title = "Prudent Memory Reclamation in Procrastination-Based Synchronization", journal = j-SIGPLAN, volume = "51", number = "4", pages = "99--112", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872405", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Procrastination is the fundamental technique used in synchronization mechanisms such as Read-Copy-Update (RCU) where writers, in order to synchronize with readers, defer the freeing of an object until there are no readers referring to the object. The synchronization mechanism determines when the deferred object is safe to reclaim and when it is actually reclaimed. Hence, such memory reclamations are completely oblivious of the memory allocator state. This induces poor memory allocator performance, for instance, when the reclamations are ill-timed. Furthermore, deferred objects provide hints about the future that inform memory regions that are about to be freed. Although useful, hints are not exploited as deferred objects are not visible to memory allocators. We introduce Prudence, a dynamic memory allocator, that is tightly integrated with the synchronization mechanism to ensure visibility of deferred objects to the memory allocator. Such an integration enables Prudence to (i) identify the safe time to reclaim deferred objects' memory, (ii) have an inclusive view of the allocated, free and about-to-be-freed objects, and (iii) exploit optimizations based on the hints about the future during important state transitions. Our evaluation in the Linux kernel shows that Prudence integrated with RCU performs 3.9X to 28X better in micro-benchmarks compared to SLUB, a recent memory allocator in the Linux kernel. It also improves the overall performance perceptibly (4\%-18\%) for a mix of widely used synthetic and application benchmarks. Further, it performs better (up to 98\%) in terms of object hits in caches, object cache churns, slab churns, peak memory usage and total fragmentation, when compared with the SLUB allocator.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Mukkara:2016:WID, author = "Anurag Mukkara and Nathan Beckmann and Daniel Sanchez", title = "{Whirlpool}: Improving Dynamic Cache Management with Static Data Classification", journal = j-SIGPLAN, volume = "51", number = "4", pages = "113--127", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872363", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cache hierarchies are increasingly non-uniform and difficult to manage. Several techniques, such as scratchpads or reuse hints, use static information about how programs access data to manage the memory hierarchy. Static techniques are effective on regular programs, but because they set fixed policies, they are vulnerable to changes in program behavior or available cache space. Instead, most systems rely on dynamic caching policies that adapt to observed program behavior. 
Unfortunately, dynamic policies spend significant resources trying to learn how programs use memory, and yet they often perform worse than a static policy. We present Whirlpool, a novel approach that combines static information with dynamic policies to reap the benefits of each. Whirlpool statically classifies data into pools based on how the program uses memory. Whirlpool then uses dynamic policies to tune the cache to each pool. Hence, rather than setting policies statically, Whirlpool uses static analysis to guide dynamic policies. We present both an API that lets programmers specify pools manually and a profiling tool that discovers pools automatically in unmodified binaries. We evaluate Whirlpool on a state-of-the-art NUCA cache. Whirlpool significantly outperforms prior approaches: on sequential programs, Whirlpool improves performance by up to 38\% and reduces data movement energy by up to 53\%; on parallel programs, Whirlpool improves performance by up to 67\% and reduces data movement energy by up to 2.6x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Jeon:2016:TTD, author = "Myeongjae Jeon and Yuxiong He and Hwanju Kim and Sameh Elnikety and Scott Rixner and Alan L. Cox", title = "{TPC}: Target-Driven Parallelism Combining Prediction and Correction to Reduce Tail Latency in Interactive Services", journal = j-SIGPLAN, volume = "51", number = "4", pages = "129--141", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872370", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In interactive services such as web search, recommendations, games and finance, reducing the tail latency is crucial to provide fast response to every user. Using web search as a driving example, we systematically characterize interactive workload to identify the opportunities and challenges for reducing tail latency. We find that the workload consists of mainly short requests that do not benefit from parallelism, and a few long requests which significantly impact the tail but exhibit high parallelism speedup. This motivates estimating request execution time, using a predictor, to identify long requests and to parallelize them. Prediction, however, is not perfect; a long request mispredicted as short is likely to contribute to the server tail latency, setting a ceiling on the achievable tail latency. We propose TPC, an approach that combines prediction information judiciously with dynamic correction for inaccurate prediction. Dynamic correction increases parallelism to accelerate a long request that is mispredicted as short. TPC carefully selects the appropriate target latencies based on system load and parallelism efficiency to reduce tail latency. We implement TPC and several prior approaches to compare them experimentally on a single search server and on a cluster of 40 search servers. The experimental results show that TPC reduces the 99th- and 99.9th-percentile latency by up to 40\% compared with the best prior work. 
Moreover, we evaluate TPC on a finance server, demonstrating its effectiveness on reducing tail latency of interactive services beyond web search.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Brown:2016:HBS, author = "Fraser Brown and Andres N{\"o}tzli and Dawson Engler", title = "How to Build Static Checking Systems Using Orders of Magnitude Less Code", journal = j-SIGPLAN, volume = "51", number = "4", pages = "143--157", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872364", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern static bug finding tools are complex. They typically consist of hundreds of thousands of lines of code, and most of them are wedded to one language (or even one compiler). This complexity makes the systems hard to understand, hard to debug, and hard to retarget to new languages, thereby dramatically limiting their scope. This paper reduces checking system complexity by addressing a fundamental assumption, the assumption that checkers must depend on a full-blown language specification and compiler front end. Instead, our program checkers are based on drastically incomplete language grammars (``micro-grammars'') that describe only portions of a language relevant to a checker. As a result, our implementation is tiny---roughly 2500 lines of code, about two orders of magnitude smaller than a typical system. We hope that this dramatic increase in simplicity will allow people to use more checkers on more systems in more languages. We implement our approach in $\mu$chex, a language-agnostic framework for writing static bug checkers. We use it to build micro-grammar based checkers for six languages (C, the C preprocessor, C++, Java, JavaScript, and Dart) and find over 700 errors in real-world projects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Zhang:2016:TED, author = "Tong Zhang and Dongyoon Lee and Changhee Jung", title = "{TxRace}: Efficient Data Race Detection Using Commodity Hardware Transactional Memory", journal = j-SIGPLAN, volume = "51", number = "4", pages = "159--173", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872384", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Detecting data races is important for debugging shared-memory multithreaded programs, but the high runtime overhead prevents the wide use of dynamic data race detectors. This paper presents TxRace, a new software data race detector that leverages commodity hardware transactional memory (HTM) to speed up data race detection. TxRace instruments a multithreaded program to transform synchronization-free regions into transactions, and exploits the conflict detection mechanism of HTM for lightweight data race detection at runtime.
However, the limitations of the current best-effort commodity HTMs expose several challenges in using them for data race detection: (1) lack of ability to pinpoint racy instructions, (2) false positives caused by cache line granularity of conflict detection, and (3) transactional aborts for non-conflict reasons (e.g., capacity or unknown). To overcome these challenges, TxRace performs lightweight HTM-based data race detection at first, and occasionally switches to slow yet precise data race detection only for the small fraction of execution intervals in which potential races are reported by HTM. According to the experimental results, TxRace reduces the average runtime overhead of dynamic data race detection from 11.68x to 4.65x with only a small number of false negatives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Amani:2016:CVH, author = "Sidney Amani and Alex Hixon and Zilin Chen and Christine Rizkallah and Peter Chubb and Liam O'Connor and Joel Beeren and Yutaka Nagashima and Japheth Lim and Thomas Sewell and Joseph Tuong and Gabriele Keller and Toby Murray and Gerwin Klein and Gernot Heiser", title = "{CoGENT}: Verifying High-Assurance File System Implementations", journal = j-SIGPLAN, volume = "51", number = "4", pages = "175--188", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872404", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an approach to writing and formally verifying high-assurance file-system code in a restricted language called COGENT, supported by a certifying compiler that produces C code, high-level specification of COGENT, and translation correctness proofs. The language is strongly typed and guarantees absence of a number of common file system implementation errors. We show how verification effort is drastically reduced for proving higher-level properties of the file system implementation by reasoning about the generated formal specification rather than its low-level C code. We use the framework to write two Linux file systems, and compare their performance with their native C implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Asmussen:2016:MHO, author = "Nils Asmussen and Marcus V{\"o}lp and Benedikt N{\"o}then and Hermann H{\"a}rtig and Gerhard Fettweis", title = "{M3}: a Hardware\slash Operating-System Co-Design to Tame Heterogeneous Manycores", journal = j-SIGPLAN, volume = "51", number = "4", pages = "189--203", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872371", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the last decade, the number of available cores increased and heterogeneity grew. In this work, we ask the question whether the design of the current operating systems (OSes) is still appropriate if these trends continue and lead to abundantly available but heterogeneous cores, or whether it forces a fundamental rethinking of how systems are designed. We argue that: 1. 
hiding heterogeneity behind a common hardware interface unifies, to a large extent, the control and coordination of cores and accelerators in the OS, 2. isolating at the network-on-chip rather than with processor features (like privileged mode, memory management unit, ...), allows running untrusted code on arbitrary cores, and 3. providing OS services via protocols over the network-on-chip, instead of via system calls, makes them accessible to arbitrary types of cores as well. In summary, this turns accelerators into first-class citizens and enables a single and convenient programming environment for all cores without the need to trust any application. In this paper, we introduce network-on-chip-level isolation, present the design of our microkernel-based OS, M3, and the common hardware interface, and evaluate the performance of our prototype in comparison to Linux. A bit surprising, without using accelerators, M3 outperforms Linux in some application-level benchmarks by more than a factor of five.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Liaqat:2016:SEE, author = "Daniyal Liaqat and Silviu Jingoi and Eyal de Lara and Ashvin Goel and Wilson To and Kevin Lee and Italo {De Moraes Garcia} and Manuel Saldana", title = "{Sidewinder}: an Energy Efficient and Developer Friendly Heterogeneous Architecture for Continuous Mobile Sensing", journal = j-SIGPLAN, volume = "51", number = "4", pages = "205--215", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872398", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Applications that perform continuous sensing on mobile phones have the potential to revolutionize everyday life. Examples range from medical and health monitoring applications, such as pedometers and fall detectors, to participatory sensing applications, such as noise pollution, traffic and seismic activity monitoring. Unfortunately, current mobile devices are a poor match for continuous sensing applications as they require the device to remain awake for extended periods of time, resulting in poor battery life. This paper presents Sidewinder, a new approach towards offloading sensor data processing to a low-power processor and waking up the main processor when events of interest occur. This approach differs from other heterogeneous architectures in that developers are presented with a programming interface that lets them construct application specific wake-up conditions by linking together and parameterizing predefined sensor data processing algorithms. 
Our experiments indicate performance that is comparable to approaches that provide fully programmable offloading, but do so with a much simpler programming interface that facilitates deployment and portability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Balkind:2016:OOS, author = "Jonathan Balkind and Michael McKeown and Yaosheng Fu and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and Mohammad Shahrad and Adi Fuchs and Samuel Payne and Xiaohua Liang and Matthew Matl and David Wentzlaff", title = "{OpenPiton}: an Open Source Manycore Research Framework", journal = j-SIGPLAN, volume = "51", number = "4", pages = "217--232", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872414", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Industry is building larger, more complex, manycore processors on the back of strong institutional knowledge, but academic projects face difficulties in replicating that scale. To alleviate these difficulties and to develop and share knowledge, the community needs open architecture frameworks for simulation, synthesis, and software exploration which support extensibility, scalability, and configurability, alongside an established base of verification tools and supported software. In this paper we present OpenPiton, an open source framework for building scalable architecture research prototypes from 1 core to 500 million cores. OpenPiton is the world's first open source, general-purpose, multithreaded manycore processor and framework. OpenPiton leverages the industry hardened OpenSPARC T1 core with modifications and builds upon it with a scratch-built, scalable uncore creating a flexible, modern manycore design. In addition, OpenPiton provides synthesis and backend scripts for ASIC and FPGA to enable other researchers to bring their designs to implementation. OpenPiton provides a complete verification infrastructure of over 8000 tests, is supported by mature software tools, runs full-stack multiuser Debian Linux, and is written in industry standard Verilog. Multiple implementations of OpenPiton have been created including a taped-out 25-core implementation in IBM's 32nm process and multiple Xilinx FPGA prototypes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Lustig:2016:CVM, author = "Daniel Lustig and Geet Sethi and Margaret Martonosi and Abhishek Bhattacharjee", title = "{COATCheck}: Verifying Memory Ordering at the Hardware-{OS} Interface", journal = j-SIGPLAN, volume = "51", number = "4", pages = "233--247", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872399", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern computer systems include numerous compute elements, from CPUs to GPUs to accelerators. 
Harnessing their full potential requires well-defined, properly-implemented memory consistency models (MCMs), and low-level system functionality such as virtual memory and address translation (AT). Unfortunately, it is difficult to specify and implement hardware-OS interactions correctly; in the past, many hardware and OS specification mismatches have resulted in implementation bugs in commercial processors. In an effort to resolve this verification gap, this paper makes the following contributions. First, we present COATCheck, an address translation-aware framework for specifying and statically verifying memory ordering enforcement at the microarchitecture and operating system levels. We develop a domain-specific language for specifying ordering enforcement, for including ordering-related OS events and hardware micro-operations, and for programmatically enumerating happens-before graphs. Using a fast and automated static constraint solver, COATCheck can efficiently analyze interesting and important memory ordering scenarios for modern, high-performance, out-of-order processors. Second, we show that previous work on Virtual Address Memory Consistency (VAMC) does not capture every translation-related ordering scenario of interest, and that some such cases even fall outside the traditional scope of consistency. We therefore introduce the term transistency model to describe the superset of consistency which captures all translation-aware sets of ordering rules.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Markuze:2016:TIP, author = "Alex Markuze and Adam Morrison and Dan Tsafrir", title = "True {IOMMU} Protection from {DMA} Attacks: When Copy is Faster than Zero Copy", journal = j-SIGPLAN, volume = "51", number = "4", pages = "249--262", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872379", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Malicious I/O devices might compromise the OS using DMAs. The OS therefore utilizes the IOMMU to map and unmap every target buffer right before and after its DMA is processed, thereby restricting DMAs to their designated locations. This usage model, however, is not truly secure for two reasons: (1) it provides protection at page granularity only, whereas DMA buffers can reside on the same page as other data; and (2) it delays DMA buffer unmaps due to performance considerations, creating a vulnerability window in which devices can access in-use memory. We propose that OSes utilize the IOMMU differently, in a manner that eliminates these two flaws. Our new usage model restricts device access to a set of shadow DMA buffers that are never unmapped, and it copies DMAed data to/from these buffers, thus providing sub-page protection while eliminating the aforementioned vulnerability window. Our key insight is that the cost of interacting with, and synchronizing access to, the slow IOMMU hardware---required for zero-copy protection against devices---makes copying preferable to zero-copying. We implement our model in Linux and evaluate it with standard networking benchmarks utilizing a 40 Gb/s NIC. We demonstrate that despite being more secure than the safest preexisting usage model, our approach provides up to 5x higher throughput.
Additionally, whereas it is inherently less scalable than an IOMMU-less (unprotected) system, our approach incurs only 0\%--25\% performance degradation in comparison.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Awad:2016:SSZ, author = "Amro Awad and Pratyusa Manadhata and Stuart Haber and Yan Solihin and William Horne", title = "Silent Shredder: Zero-Cost Shredding for Secure Non-Volatile Main Memory Controllers", journal = j-SIGPLAN, volume = "51", number = "4", pages = "263--276", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As non-volatile memory (NVM) technologies are expected to replace DRAM in the near future, new challenges have emerged. For example, NVMs have slow and power-consuming writes, and limited write endurance. In addition, NVMs have a data remanence vulnerability, i.e., they retain data for a long time after being powered off. NVM encryption alleviates the vulnerability, but exacerbates the limited endurance by increasing the number of writes to memory. We observe that, in current systems, a large percentage of main memory writes result from data shredding in operating systems, a process of zeroing out physical pages before mapping them to new processes, in order to protect previous processes' data. In this paper, we propose Silent Shredder, which repurposes initialization vectors used in standard counter mode encryption to completely eliminate the data shredding writes. Silent Shredder also speeds up reading shredded cache lines, and hence reduces power consumption and improves overall performance. To evaluate our design, we run three PowerGraph applications and 26 multi-programmed workloads from the SPEC 2006 suite, on a gem5-based full system simulator. Silent Shredder eliminates an average of 48.6\% of the writes in the initialization and graph construction phases. It speeds up main memory reads by 3.3 times, and improves the number of instructions per cycle (IPC) by 6.4\% on average. Finally, we discuss several use cases, including virtual machines' data isolation and user-level large data initialization, where Silent Shredder can be used effectively at no extra cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Kwon:2016:SPT, author = "Youngjin Kwon and Alan M. Dunn and Michael Z. Lee and Owen S. Hofmann and Yuanzhong Xu and Emmett Witchel", title = "{Sego}: Pervasive Trusted Metadata for Efficiently Verified Untrusted System Services", journal = j-SIGPLAN, volume = "51", number = "4", pages = "277--290", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872372", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Sego is a hypervisor-based system that gives strong privacy and integrity guarantees to trusted applications, even when the guest operating system is compromised or hostile. 
Sego verifies operating system services, like the file system, instead of replacing them. By associating trusted metadata with user data across all system devices, Sego verifies system services more efficiently than previous systems, especially services that depend on data contents. We extensively evaluate Sego's performance on real workloads and implement a kernel fault injector to validate Sego's file system-agnostic crash consistency and recovery protocol.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Tsafrir:2016:SAW, author = "Dan Tsafrir", title = "Synopsis of the {ASPLOS '16 Wild and Crazy Ideas (WACI)} Invited-Speakers Session", journal = j-SIGPLAN, volume = "51", number = "4", pages = "291--294", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2876512", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Wild and Crazy Ideas (WACI) session is a longstanding tradition at ASPLOS, soliciting talks that consist of forward-looking, visionary, inspiring, creative, far out or just plain amazing ideas presented in an exciting way. (Amusing elements in the presentations are tolerated ;-) but are in fact optional.) The first WACI session took place in 1998. Back then, the call for talks included a problem statement, which contended that ``papers usually do not get admitted to [such conferences as] ISCA or ASPLOS unless the systems that they describe are mature enough to run [some standard benchmark suites, which] has a chilling effect on the idea generation process---encouraging incremental research'' [1]. The 1998 WACI session turned out to be a great success. Its webpage states that ``there were 42 submissions [competing over] only eight time slots, [which resulted in] this session [having] a lower acceptance rate than the conference itself'' [2]. But the times they are a-changin' [3], and the WACI session no longer enjoys that many submissions (Figure 1), perhaps because nowadays there exist many forums for researchers to describe/discuss their preliminary ideas, including: the ``hot topics in'' workshops [4--7]; a journal like CAL, dedicated to early results [8]; main conferences soliciting short submissions describing ``original or unconventional ideas at a preliminary stage'' in addition to regular papers [9]; and the many workshops co-located with main conferences, like ISCA '15, which hosted thirteen such workshops [10]. Regardless of the reason for the declining number of submissions, this time we've decided to organize the WACI session differently to ensure its continued high quality. Instead of soliciting talks via an open call and hoping for the best, we proactively invited speakers whom we believe are capable of delivering excellent WACI presentations. That is, this year's WACI session consists exclusively of invited speakers. Filling up the available slots turned out to be fairly easy, as most of the researchers we invited promptly accepted our invitation. The duration of each talk was set to be eight minutes (exactly as in the first WACI session from 1998) plus two minutes for questions. The talks are outlined below. 
We believe they are interesting and exciting, and we hope the attendees of the session will find them stimulating and insightful.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Williams:2016:BIC, author = "R. Stanley Williams", title = "Brain Inspired Computing", journal = j-SIGPLAN, volume = "51", number = "4", pages = "295--295", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872417", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Phothilimthana:2016:SS, author = "Phitchaya Mangpo Phothilimthana and Aditya Thakur and Rastislav Bodik and Dinakar Dhurjati", title = "Scaling up Superoptimization", journal = j-SIGPLAN, volume = "51", number = "4", pages = "297--310", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872387", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Developing a code optimizer is challenging, especially for new, idiosyncratic ISAs. Superoptimization can, in principle, discover machine-specific optimizations automatically by searching the space of all instruction sequences. If we can increase the size of code fragments a superoptimizer can optimize, we will be able to discover more optimizations. We develop LENS, a search algorithm that increases the size of code a superoptimizer can synthesize by rapidly pruning away invalid candidate programs. Pruning is achieved by selectively refining the abstraction under which candidates are considered equivalent, only in the promising part of the candidate space. LENS also uses a bidirectional search strategy to prune the candidate space from both forward and backward directions. These pruning strategies allow LENS to solve twice as many benchmarks as existing enumerative search algorithms, while LENS is about 11-times faster. Additionally, we increase the effective size of the superoptimized fragments by relaxing the correctness condition using contexts (surrounding code). Finally, we combine LENS with complementary search techniques into a cooperative superoptimizer, which exploits the stochastic search to make random jumps in a large candidate space, and a symbolic (SAT-solver-based) search to synthesize arbitrary constants. While existing superoptimizers consistently solve 9--16 out of 32 benchmarks, the cooperative superoptimizer solves 29 benchmarks. It can synthesize code fragments that are up to 82\% faster than code generated by gcc -O3 from WiBench and MiBench.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Hasabnis:2016:LAI, author = "Niranjan Hasabnis and R. 
Sekar", title = "Lifting Assembly to Intermediate Representation: a Novel Approach Leveraging Compilers", journal = j-SIGPLAN, volume = "51", number = "4", pages = "311--324", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872380", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Translating low-level machine instructions into higher-level intermediate language (IL) is one of the central steps in many binary analysis and instrumentation systems. Existing systems build such translators manually. As a result, it takes a great deal of effort to support new architectures. Even for widely deployed architectures, full instruction sets may not be modeled, e.g., mature systems such as Valgrind still lack support for AVX, FMA4 and SSE4.1 for x86 processors. To overcome these difficulties, we propose a novel approach that leverages knowledge about instruction set semantics that is already embedded into modern compilers such as GCC. In particular, we present a learning-based approach for automating the translation of assembly instructions to a compiler's architecture-neutral IL. We present an experimental evaluation that demonstrates the ability of our approach to easily support many architectures (x86, ARM and AVR), including their advanced instruction sets. Our implementation is available as open-source software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Muralidharan:2016:AAC, author = "Saurav Muralidharan and Amit Roy and Mary Hall and Michael Garland and Piyush Rai", title = "Architecture-Adaptive Code Variant Tuning", journal = j-SIGPLAN, volume = "51", number = "4", pages = "325--338", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872411", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Code variants represent alternative implementations of a computation, and are common in high-performance libraries and applications to facilitate selecting the most appropriate implementation for a specific execution context (target architecture and input dataset). Automating code variant selection typically relies on machine learning to construct a model during an offline learning phase that can be quickly queried at runtime once the execution context is known. In this paper, we define a new approach called architecture-adaptive code variant tuning, where the variant selection model is learned on a set of source architectures, and then used to predict variants on a new target architecture without having to repeat the training process. We pose this as a multi-task learning problem, where each source architecture corresponds to a task; we use device features in the construction of the variant selection model. This work explores the effectiveness of multi-task learning and the impact of different strategies for device feature selection. We evaluate our approach on a set of benchmarks and a collection of six NVIDIA GPU architectures from three distinct generations. 
We achieve performance results that are mostly comparable to the previous approach of tuning for a single GPU architecture without having to repeat the learning phase.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Lin:2016:SKT, author = "Xiaofeng Lin and Yu Chen and Xiaodong Li and Junjie Mao and Jiaquan He and Wei Xu and Yuanchun Shi", title = "Scalable Kernel {TCP} Design and Implementation for Short-Lived Connections", journal = j-SIGPLAN, volume = "51", number = "4", pages = "339--352", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872391", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the rapid growth of network bandwidth, increases in CPU cores on a single machine, and application API models demanding more short-lived connections, a scalable TCP stack is performance-critical. Although many clean-state designs have been proposed, production environments still call for a bottom-up parallel TCP stack design that is backward-compatible with existing applications. We present Fastsocket, a BSD Socket-compatible and scalable kernel socket design, which achieves table-level connection partition in TCP stack and guarantees connection locality for both passive and active connections. Fastsocket architecture is a ground up partition design, from NIC interrupts all the way up to applications, which naturally eliminates various lock contentions in the entire stack. Moreover, Fastsocket maintains the full functionality of the kernel TCP stack and BSD-socket-compatible API, and thus applications need no modifications. Our evaluations show that Fastsocket achieves a speedup of 20.4x on a 24-core machine under a workload of short-lived connections, outperforming the state-of-the-art Linux kernel TCP implementations. When scaling up to 24 CPU cores, Fastsocket increases the throughput of Nginx and HAProxy by 267\% and 621\% respectively compared with the base Linux kernel. We also demonstrate that Fastsocket can achieve scalability and preserve BSD socket API at the same time. Fastsocket is already deployed in the production environment of Sina WeiBo, serving 50 million daily active users and billions of requests per day.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{ElHajj:2016:SPM, author = "Izzat {El Hajj} and Alexander Merritt and Gerd Zellweger and Dejan Milojicic and Reto Achermann and Paolo Faraboschi and Wen-mei Hwu and Timothy Roscoe and Karsten Schwan", title = "{SpaceJMP}: Programming with Multiple Virtual Address Spaces", journal = j-SIGPLAN, volume = "51", number = "4", pages = "353--368", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872366", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Memory-centric computing demands careful organization of the virtual address space, but traditional methods for doing so are inflexible and inefficient. 
If an application wishes to address larger physical memory than virtual address bits allow, if it wishes to maintain pointer-based data structures beyond process lifetimes, or if it wishes to share large amounts of memory across simultaneously executing processes, legacy interfaces for managing the address space are cumbersome and often incur excessive overheads. We propose a new operating system design that promotes virtual address spaces to first-class citizens, enabling process threads to attach to, detach from, and switch between multiple virtual address spaces. Our work enables data-centric applications to utilize vast physical memory beyond the virtual range, represent persistent pointer-rich data structures without special pointer representations, and share large amounts of memory between processes efficiently. We describe our prototype implementations in the DragonFly BSD and Barrelfish operating systems. We also present programming semantics and a compiler transformation to detect unsafe pointer usage. We demonstrate the benefits of our work on data-intensive applications such as the GUPS benchmark, the SAMTools genomics workflow, and the Redis key-value store.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Lin:2016:MTP, author = "Felix Xiaozhu Lin and Xu Liu", title = "{\tt memif}: Towards Programming Heterogeneous Memory Asynchronously", journal = j-SIGPLAN, volume = "51", number = "4", pages = "369--383", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872401", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To harness a heterogeneous memory hierarchy, it is advantageous to integrate application knowledge in guiding frequent memory move, i.e., replicating or migrating virtual memory regions. To this end, we present memif, a protected OS service for asynchronous, hardware-accelerated memory move. Compared to the state of the art --- page migration in Linux, memif incurs low overhead and low latency; in order to do so, it not only redefines the semantics of kernel interface but also overhauls the underlying mechanisms, including request/completion management, race handling, and DMA engine configuration. We implement memif in Linux for a server-class system-on-chip that features heterogeneous memories. Compared to the current Linux page migration, memif reduces CPU usage by up to 15\% for small pages and by up to 38x for large pages; in continuously serving requests, memif has no need for request batching and reduces latency by up to 63\%. By crafting a small runtime atop memif, we improve the throughputs for a set of streaming workloads by up to 33\%. 
Overall, memif has opened the door to software management of heterogeneous memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Kim:2016:NEN, author = "Wook-Hee Kim and Jinwoong Kim and Woongki Baek and Beomseok Nam and Youjip Won", title = "{NVWAL}: Exploiting {NVRAM} in Write-Ahead Logging", journal = j-SIGPLAN, volume = "51", number = "4", pages = "385--398", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872392", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging byte-addressable non-volatile memory is considered an alternative storage device for database logs that require persistency and high performance. In this work, we develop NVWAL (NVRAM Write-Ahead Logging) for SQLite. The contribution of NVWAL consists of three elements: (i) byte-granularity differential logging that effectively eliminates the excessive I/O overhead of filesystem-based logging or journaling, (ii) transaction-aware lazy synchronization that reduces cache synchronization overhead by two-thirds, and (iii) user-level heap management of the NVRAM persistent WAL structure, which reduces the overhead of managing persistent objects. We implemented NVWAL in SQLite and measured the performance on a Nexus 5 smartphone and an NVRAM emulation board --- Tuna. Our performance study shows the following: (i) the overhead of enforcing strict ordering of NVRAM writes can be reduced via NVRAM-aware transaction management. (ii) From the application performance point of view, the overhead of guaranteeing failure atomicity is negligible; the cache line flush overhead accounts for only 0.8~4.6\% of transaction execution time. Therefore, application performance is much less sensitive to the NVRAM performance than we expected. Decreasing the NVRAM latency by one-fifth (from 1942 nsec to 437 nsec), SQLite achieves a mere 4\% performance gain (from 2517 ins/sec to 2621 ins/sec). (iii) Overall, when the write latency of NVRAM is 2 usec, NVWAL increases SQLite performance by at least 10x compared to that of WAL on flash memory (from 541 ins/sec to 5812 ins/sec).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Kolli:2016:HPT, author = "Aasheesh Kolli and Steven Pelley and Ali Saidi and Peter M. Chen and Thomas F. Wenisch", title = "High-Performance Transactions for Persistent Memories", journal = j-SIGPLAN, volume = "51", number = "4", pages = "399--411", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872381", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging non-volatile memory (NVRAM) technologies offer the durability of disk with the byte-addressability of DRAM. These devices will allow software to access persistent data structures directly in NVRAM using processor loads and stores, however, ensuring consistency of persistent data across power failures and crashes is difficult. Atomic, durable transactions are a widely used abstraction to enforce such consistency. 
Implementing transactions on NVRAM requires the ability to constrain the order of NVRAM writes, for example, to ensure that a transaction's log record is complete before it is marked committed. Since NVRAM write latencies are expected to be high, minimizing these ordering constraints is critical for achieving high performance. Recent work has proposed programming interfaces to express NVRAM write ordering constraints to hardware so that NVRAM writes may be coalesced and reordered while preserving necessary constraints. Unfortunately, a straightforward implementation of transactions under these interfaces imposes unnecessary constraints. We show how to remove these dependencies through a variety of techniques, notably, deferring commit until after locks are released. We present a comprehensive analysis contrasting two transaction designs across three NVRAM programming interfaces, demonstrating up to 2.5x speedup.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Guo:2016:HDI, author = "Qing Guo and Karin Strauss and Luis Ceze and Henrique S. Malvar", title = "High-Density Image Storage Using Approximate Memory Cells", journal = j-SIGPLAN, volume = "51", number = "4", pages = "413--426", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872413", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper proposes tailoring image encoding for an approximate storage substrate. We demonstrate that indiscriminately storing encoded images in approximate memory generates unacceptable and uncontrollable quality degradation. The key finding is that errors in the encoded bit streams have non-uniform impact on the decoded image quality. We develop a methodology to determine the relative importance of encoded bits and store them in an approximate storage substrate. The storage cells are optimized to reduce error rate via biasing and are tuned to meet the desired reliability requirement via selective error correction. In a case study with the progressive transform codec (PTC), a precursor to JPEG XR, the proposed approximate image storage system exhibits a 2.7x increase in density of pixels per silicon volume under bounded error rates, and this achievement is additive to the storage savings of PTC compression.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Izraelevitz:2016:FAP, author = "Joseph Izraelevitz and Terence Kelly and Aasheesh Kolli", title = "Failure-Atomic Persistent Memory Updates via {JUSTDO} Logging", journal = j-SIGPLAN, volume = "51", number = "4", pages = "427--442", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872410", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Persistent memory invites applications to manipulate persistent data via load and store instructions. 
Because failures during updates may destroy transient data (e.g., in CPU registers), preserving data integrity in the presence of failures requires failure-atomic bundles of updates. Prior failure atomicity approaches for persistent memory entail overheads due to logging and CPU cache flushing. Persistent caches can eliminate the need for flushing, but conventional logging remains complex and memory intensive. We present the design and implementation of JUSTDO logging, a new failure atomicity mechanism that greatly reduces the memory footprint of logs, simplifies log management, and enables fast parallel recovery following failure. Crash-injection tests confirm that JUSTDO logging preserves application data integrity and performance evaluations show that it improves throughput 3x or more compared with a state-of-the-art alternative for a spectrum of data-intensive algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Han:2016:IMD, author = "Jaeung Han and Seungheun Jeon and Young-ri Choi and Jaehyuk Huh", title = "Interference Management for Distributed Parallel Applications in Consolidated Clusters", journal = j-SIGPLAN, volume = "51", number = "4", pages = "443--456", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872388", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Consolidating multiple applications on a system can improve the overall resource utilization of data center systems. However, such consolidation can adversely affect the performance of some applications due to interference caused by resource contention. Despite many prior studies on the interference effects in single-node systems, the interference behaviors of distributed parallel applications have not been investigated thoroughly. With distributed applications, a local interference in a node can affect the whole execution of an application spanning many nodes. This paper studies an interference modeling methodology for distributed applications to predict their performance under interference effects in consolidated clusters. This study first characterizes the effects of interference for various distributed applications over different interference settings, and analyzes how diverse interference intensities on multiple nodes affect the overall performance. Based on the characterization, this study proposes a static profiling-based model for interference propagation and heterogeneity behaviors. 
In addition, this paper presents use case studies of the modeling method, two interference-aware placement techniques for consolidated virtual clusters, which attempt to maximize the overall throughput or to guarantee the quality-of-service.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Maas:2016:THL, author = "Martin Maas and Krste Asanovi{\'c} and Tim Harris and John Kubiatowicz", title = "{Taurus}: a Holistic Language Runtime System for Coordinating Distributed Managed-Language Applications", journal = j-SIGPLAN, volume = "51", number = "4", pages = "457--471", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872386", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many distributed workloads in today's data centers are written in managed languages such as Java or Ruby. Examples include big data frameworks such as Hadoop, data stores such as Cassandra or applications such as the SOLR search engine. These workloads typically run across many independent language runtime systems on different nodes. This setup represents a source of inefficiency, as these language runtime systems are unaware of each other. For example, they may perform Garbage Collection at times that are locally reasonable but not in a distributed setting. We address these problems by introducing the concept of a Holistic Runtime System that makes runtime-level decisions for the entire distributed application rather than locally. We then present Taurus, a Holistic Runtime System prototype. Taurus is a JVM drop-in replacement, requires almost no configuration and can run unmodified off-the-shelf Java applications. Taurus enforces user-defined coordination policies and provides a DSL for writing these policies. By applying Taurus to Garbage Collection, we demonstrate the potential of such a system and use it to explore coordination strategies for the runtime systems of real-world distributed applications, to improve application performance and address tail-latencies in latency-sensitive workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Delimitrou:2016:HRE, author = "Christina Delimitrou and Christos Kozyrakis", title = "{HCloud}: Resource-Efficient Provisioning in Shared Cloud Systems", journal = j-SIGPLAN, volume = "51", number = "4", pages = "473--488", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872365", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cloud computing promises flexibility and high performance for users and cost efficiency for operators. To achieve this, cloud providers offer instances of different sizes, both as long-term reservations and short-term, on-demand allocations. Unfortunately, determining the best provisioning strategy is a complex, multi-dimensional problem that depends on the load fluctuation and duration of incoming jobs, and the performance unpredictability and cost of resources. 
We first compare the two main provisioning strategies (reserved and on-demand resources) on Google Compute Engine (GCE) using three representative workload scenarios with batch and latency-critical applications. We show that either approach is suboptimal for performance or cost. We then present HCloud, a hybrid provisioning system that uses both reserved and on-demand resources. HCloud determines which jobs should be mapped to reserved versus on-demand resources based on overall load, and resource unpredictability. It also determines the optimal instance size an application needs to satisfy its Quality of Service (QoS) constraints. We demonstrate that hybrid configurations improve performance by 2.1x compared to fully on-demand provisioning, and reduce cost by 46\% compared to fully reserved systems. We also show that hybrid strategies are robust to variation in system and job parameters, such as cost and system load.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Yu:2016:CWM, author = "Xiao Yu and Pallavi Joshi and Jianwu Xu and Guoliang Jin and Hui Zhang and Guofei Jiang", title = "{CloudSeer}: Workflow Monitoring of Cloud Infrastructures via Interleaved Logs", journal = j-SIGPLAN, volume = "51", number = "4", pages = "489--502", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872407", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cloud infrastructures provide a rich set of management tasks that operate computing, storage, and networking resources in the cloud. Monitoring the executions of these tasks is crucial for cloud providers to promptly find and understand problems that compromise cloud availability. However, such monitoring is challenging because there are multiple distributed service components involved in the executions. CloudSeer enables effective workflow monitoring. It takes a lightweight non-intrusive approach that purely works on interleaved logs widely existing in cloud infrastructures. CloudSeer first builds an automaton for the workflow of each management task based on normal executions, and then it checks log messages against a set of automata for workflow divergences in a streaming manner. Divergences found during the checking process indicate potential execution problems, which may or may not be accompanied by error log messages. For each potential problem, CloudSeer outputs necessary context information including the affected task automaton and related log messages hinting where the problem occurs to help further diagnosis. 
Our experiments on OpenStack, a popular open-source cloud infrastructure, show that CloudSeer's efficiency and problem-detection capability are suitable for online monitoring.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Kwon:2016:LCI, author = "Yonghwi Kwon and Dohyeong Kim and William Nick Sumner and Kyungtae Kim and Brendan Saltaformaggio and Xiangyu Zhang and Dongyan Xu", title = "{LDX}: Causality Inference by Lightweight Dual Execution", journal = j-SIGPLAN, volume = "51", number = "4", pages = "503--515", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872395", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Causality inference, such as dynamic taint analysis, has many applications (e.g., information leak detection). It determines whether an event e is causally dependent on a preceding event c during execution. We develop a new causality inference engine LDX. Given an execution, it spawns a slave execution, in which it mutates c and observes whether any change is induced at e. To preclude non-determinism, LDX couples the executions by sharing syscall outcomes. To handle path differences induced by the perturbation, we develop a novel on-the-fly execution alignment scheme that maintains a counter to reflect the progress of execution. The scheme relies on program analysis and compiler transformation. LDX can effectively detect information leak and security attacks with an average overhead of 6.08\% while running the master and the slave concurrently on separate CPUs, much lower than existing systems that require instruction level monitoring. Furthermore, it has much better accuracy in causality inference.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Leesatapornwongsa:2016:TTN, author = "Tanakorn Leesatapornwongsa and Jeffrey F. Lukman and Shan Lu and Haryadi S. Gunawi", title = "{TaxDC}: a Taxonomy of Non-Deterministic Concurrency Bugs in Datacenter Distributed Systems", journal = j-SIGPLAN, volume = "51", number = "4", pages = "517--530", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872374", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present TaxDC, the largest and most comprehensive taxonomy of non-deterministic concurrency bugs in distributed systems. We study 104 distributed concurrency (DC) bugs from four widely-deployed cloud-scale datacenter distributed systems, Cassandra, Hadoop MapReduce, HBase and ZooKeeper. We study DC-bug characteristics along several axes of analysis such as the triggering timing condition and input preconditions, error and failure symptoms, and fix strategies, collectively stored as 2,083 classification labels in TaxDC database. 
We discuss how our study can open up many new research directions in combating DC bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Mao:2016:RFR, author = "Junjie Mao and Yu Chen and Qixue Xiao and Yuanchun Shi", title = "{RID}: Finding Reference Count Bugs with Inconsistent Path Pair Checking", journal = j-SIGPLAN, volume = "51", number = "4", pages = "531--544", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872389", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reference counts are widely used in OS kernels for resource management. However, reference counts are not trivial to use correctly in large-scale programs because it is left to developers to make sure that an increment to a reference count is always paired with a decrement. This paper proposes inconsistent path pair checking, a novel technique that can statically discover bugs related to reference counts without knowing how reference counts should be changed in a function. A prototype called RID is implemented and evaluations show that RID can discover more than 80 bugs which were confirmed by the developers in the latest Linux kernel. The results also show that RID tends to reveal bugs caused by developers' misunderstanding of API specifications or error conditions that are not handled properly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Zhang:2016:MPU, author = "Huazhe Zhang and Henry Hoffmann", title = "Maximizing Performance Under a Power Cap: a Comparison of Hardware, Software, and Hybrid Techniques", journal = j-SIGPLAN, volume = "51", number = "4", pages = "545--559", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872375", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Power and thermal dissipation constrain multicore performance scaling. Modern processors are built such that they could sustain damaging levels of power dissipation, creating a need for systems that can implement processor power caps. A particular challenge is developing systems that can maximize performance within a power cap, and approaches have been proposed in both software and hardware. Software approaches are flexible, allowing multiple hardware resources to be coordinated for maximum performance, but software is slow, requiring a long time to converge to the power target. In contrast, hardware power capping quickly converges to the power cap, but only manages voltage and frequency, limiting its potential performance. In this work we propose PUPiL, a hybrid software/hardware power capping system. Unlike previous approaches, PUPiL combines hardware's fast reaction time with software's flexibility. We implement PUPiL on a real Linux/x86 platform and compare it to Intel's commercial hardware power capping system for both single and multi-application workloads. We find PUPiL provides the same reaction time as Intel's hardware with significantly higher performance.
On average, PUPiL outperforms hardware by 1.18--2.4x depending on workload and power target. Thus, PUPiL provides a promising way to enforce power caps with greater performance than current state-of-the-art hardware-only approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Fan:2016:CSG, author = "Songchun Fan and Seyed Majid Zahedi and Benjamin C. Lee", title = "The Computational Sprinting Game", journal = j-SIGPLAN, volume = "51", number = "4", pages = "561--575", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872383", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computational sprinting is a class of mechanisms that boost performance but dissipate additional power. We describe a sprinting architecture in which many, independent chip multiprocessors share a power supply and sprints are constrained by the chips' thermal limits and the rack's power limits. Moreover, we present the computational sprinting game, a multi-agent perspective on managing sprints. Strategic agents decide whether to sprint based on application phases and system conditions. The game produces an equilibrium that improves task throughput for data analytics workloads by 4-6$ \times $ over prior greedy heuristics and performs within 90\% of an upper bound on throughput from a globally optimized policy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Colin:2016:EIF, author = "Alexei Colin and Graham Harvey and Brandon Lucia and Alanson P. Sample", title = "An Energy-interference-free Hardware-Software Debugger for Intermittent Energy-harvesting Systems", journal = j-SIGPLAN, volume = "51", number = "4", pages = "577--589", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872409", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy-autonomous computing devices have the potential to extend the reach of computing to a scale beyond either wired or battery-powered systems. However, these devices pose a unique set of challenges to application developers who lack both hardware and software support tools. Energy harvesting devices experience power intermittence which causes the system to reset and power-cycle unpredictably, tens to hundreds of times per second. This can result in code execution errors that are not possible in continuously-powered systems and cannot be diagnosed with conventional debugging tools such as JTAG and/or oscilloscopes. We propose the Energy-interference-free Debugger, a hardware and software platform for monitoring and debugging intermittent systems without adversely affecting their energy state. The Energy-interference-free Debugger re-creates a familiar debugging environment for intermittent software and augments it with debugging primitives for effective diagnosis of intermittence bugs.
Our evaluation of the Energy-interference-free Debugger quantifies its energy-interference-freedom and shows its value in a set of debugging tasks in complex test programs and several real applications, including RFID code and a machine-learning-based activity recognition system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Witchel:2016:PPW, author = "Emmett Witchel", title = "Programmer Productivity in a World of Mushy Interfaces: Challenges of the Post-{ISA} Reality", journal = j-SIGPLAN, volume = "51", number = "4", pages = "591--591", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2876511", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Since 1964, we have had the notion that the instruction set architecture (ISA) is a useful and fairly opaque abstraction layer between hardware and software. Software rode hardware's performance wave while remaining gloriously oblivious to hardware's growing complexity. Unfortunately, the jig is up. We still have ISAs, but the abstraction no longer offers seamless portability---parallel software needs to be tuned for different core counts, and heterogeneous processing elements (CPUs, GPUs, accelerators) further complicate programmability. We are better at building large-scale heterogeneous processors than we are at programming them. Maintaining software across multiple current platforms is difficult and porting to future platforms is also difficult. There have been many technical responses: virtual ISAs (e.g., NVIDIA's PTX), higher-level programming interfaces (e.g., CUDA or OpenCL), and late-stage compilation and platform-specific tailoring (e.g., Android ART), etc. A team of opinionated experts, drawn from the three ASPLOS communities, will examine the problem of programmer productivity in the post-ISA world, first from the perspective of their area of expertise and then noting the contributions from the other two communities. What research will save us and how? This wide-ranging debate will frame important research areas for future work while being grounded in frank discussion about what has succeeded in the past. Attendees can expect actionable insight into important research issues as well as an entertaining discussion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Angstadt:2016:RPP, author = "Kevin Angstadt and Westley Weimer and Kevin Skadron", title = "{RAPID} Programming of Pattern-Recognition Processors", journal = j-SIGPLAN, volume = "51", number = "4", pages = "593--605", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872393", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present RAPID, a high-level programming language and combined imperative and declarative model for programming pattern-recognition processors, such as Micron's Automata Processor (AP).
The AP is a novel, non-Von Neumann architecture for direct execution of non-deterministic finite automata (NFAs), and has been demonstrated to provide substantial speedup for a variety of data-processing applications. RAPID is clear, maintainable, concise, and efficient both at compile and run time. Language features, such as code abstraction and parallel control structures, map well to pattern-matching problems, providing clarity and maintainability. For generation of efficient runtime code, we present algorithms to convert RAPID programs into finite automata. Further, we introduce a tessellation technique for configuring the AP, which significantly reduces compile time, increases programmer productivity, and improves maintainability. We evaluate five RAPID programs against custom, baseline implementations previously demonstrated to be significantly accelerated by the AP. We find that RAPID programs are much shorter in length, are expressible at a higher level of abstraction than their handcrafted counterparts, and yield generated code that is often more compact. In addition, our tessellation technique for configuring the AP has comparable device utilization to, and results in compilation that is up to four orders of magnitude faster than, current solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Sui:2016:PCA, author = "Xin Sui and Andrew Lenharth and Donald S. Fussell and Keshav Pingali", title = "Proactive Control of Approximate Programs", journal = j-SIGPLAN, volume = "51", number = "4", pages = "607--621", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872402", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Approximate computing trades off accuracy of results for resources such as energy or computing time. There is a large and rapidly growing literature on approximate computing that has focused mostly on showing the benefits of approximate computing. However, we know relatively little about how to control approximation in a disciplined way. In this paper, we address the problem of controlling approximation for non-streaming programs that have a set of ``knobs'' that can be dialed up or down to control the level of approximation of different components in the program. We formulate this control problem as a constrained optimization problem, and describe a system called Capri that uses machine learning to learn cost and error models for the program, and uses these models to determine, for a desired level of approximation, knob settings that optimize metrics such as running time or energy usage. 
Experimental results with complex benchmarks from different problem domains demonstrate the effectiveness of this approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Park:2016:ATC, author = "Jongse Park and Emmanuel Amaro and Divya Mahajan and Bradley Thwaites and Hadi Esmaeilzadeh", title = "{AxGames}: Towards Crowdsourcing Quality Target Determination in Approximate Computing", journal = j-SIGPLAN, volume = "51", number = "4", pages = "623--636", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872376", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Approximate computing trades quality of application output for higher efficiency and performance. Approximation is useful only if its impact on application output quality is acceptable to the users. However, there is a lack of systematic solutions and studies that explore users' perspective on the effects of approximation. In this paper, we seek to provide one such solution for the developers to probe and discover the boundary of quality loss that most users will deem acceptable. We propose AxGames, a crowdsourced solution that enables developers to readily infer a statistical common ground from the general public through three entertaining games. The users engage in these games by betting on their opinion about the quality loss of the final output while the AxGames framework collects statistics about their perceptions. The framework then statistically analyzes the results to determine the acceptable levels of quality for a pair of (application, approximation technique). The three games are designed such that they effectively capture quality requirements with various tradeoffs and contexts. To evaluate AxGames, we examine seven diverse applications that produce user perceptible outputs and cover a wide range of domains, including image processing, optical character recognition, speech to text conversion, and audio processing. We recruit 700 participants/users through Amazon's Mechanical Turk to play the games that collect statistics about their perception on different levels of quality. Subsequently, the AxGames framework uses the Clopper-Pearson exact method, which computes a binomial proportion confidence interval, to analyze the collected statistics for each level of quality. Using this analysis, AxGames can statistically project the quality level that satisfies a given percentage of users. The developers can use these statistical projections to tune the level of approximation based on the user experience. We find that the level of acceptable quality loss significantly varies across applications. For instance, to satisfy 90\% of users, the level of acceptable quality loss is 2\% for one application (image processing) and 26\% for another (audio processing). Moreover, the pattern with which the crowd responds to approximation takes significantly different shape and form depending on the class of applications. 
These results confirm the necessity of solutions that systematically explore the effect of approximation on the end user experience.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Bornholt:2016:DBA, author = "James Bornholt and Randolph Lopez and Douglas M. Carmean and Luis Ceze and Georg Seelig and Karin Strauss", title = "A {DNA}-Based Archival Storage System", journal = j-SIGPLAN, volume = "51", number = "4", pages = "637--649", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872397", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Demand for data storage is growing exponentially, but the capacity of existing storage media is not keeping up. Using DNA to archive data is an attractive possibility because it is extremely dense, with a raw limit of 1 exabyte/mm$^3$ (10$^9$ GB/mm$^3$), and long-lasting, with observed half-life of over 500 years. This paper presents an architecture for a DNA-based archival storage system. It is structured as a key-value store, and leverages common biochemical techniques to provide random access. We also propose a new encoding scheme that offers controllable redundancy, trading off reliability for density. We demonstrate feasibility, random access, and robustness of the proposed encoding with wet lab experiments involving 151 kB of synthesized DNA and a 42 kB random-access subset, and simulation experiments of larger sets calibrated to the wet lab experiments. Finally, we highlight trends in biotechnology that indicate the impending practicality of DNA storage for much larger datasets.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Prabhakar:2016:GCH, author = "Raghu Prabhakar and David Koeplinger and Kevin J. Brown and HyoukJoong Lee and Christopher {De Sa} and Christos Kozyrakis and Kunle Olukotun", title = "Generating Configurable Hardware from Parallel Patterns", journal = j-SIGPLAN, volume = "51", number = "4", pages = "651--665", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872415", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In recent years the computing landscape has seen an increasing shift towards specialized accelerators. Field programmable gate arrays (FPGAs) are particularly promising for the implementation of these accelerators, as they offer significant performance and energy improvements over CPUs for a wide class of applications and are far more flexible than fixed-function ASICs. However, FPGAs are difficult to program. Traditional programming models for reconfigurable logic use low-level hardware description languages like Verilog and VHDL, which have none of the productivity features of modern software languages but produce very efficient designs, and low-level software languages like C and OpenCL coupled with high-level synthesis (HLS) tools that typically produce designs that are far less efficient.
Functional languages with parallel patterns are a better fit for hardware generation because they provide high-level abstractions to programmers with little experience in hardware design and avoid many of the problems faced when generating hardware from imperative languages. In this paper, we identify two important optimizations for using parallel patterns to generate efficient hardware: tiling and metapipelining. We present a general representation of tiled parallel patterns, and provide rules for automatically tiling patterns and generating metapipelines. We demonstrate experimentally that these optimizations result in speedups up to 39.4$ \times $ on a set of benchmarks from the data analytics domain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Chang:2016:DLD, author = "Li-Wen Chang and Hee-Seok Kim and Wen-mei W. Hwu", title = "{DySel}: Lightweight Dynamic Selection for Kernel-based Data-parallel Programming Model", journal = j-SIGPLAN, volume = "51", number = "4", pages = "667--680", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872373", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The rising pressure for simultaneously improving performance and reducing power is driving more diversity into all aspects of computing devices. An algorithm that is well-matched to the target hardware can run multiple times faster and more energy efficiently than one that is not. The problem is complicated by the fact that a program's input also affects the appropriate choice of algorithm. As a result, software developers have been faced with the challenge of determining the appropriate algorithm for each potential combination of target device and data. This paper presents DySel, a novel runtime system for automating such determination for kernel-based data parallel programming models such as OpenCL, CUDA, OpenACC, and C++AMP. These programming models cover many applications that demand high performance in mobile, cloud and high-performance computing. DySel systematically deploys candidate kernels on a small portion of the actual data to determine which achieves the best performance for the hardware-data combination. The test-deployment, referred to as micro-profiling, contributes to the final execution result and incurs less than 8\% of overhead in the worst observed case when compared to an oracle. 
We show four major use cases where DySel provides significantly more consistent performance without tedious effort from the developer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Chen:2016:BQA, author = "Quan Chen and Hailong Yang and Jason Mars and Lingjia Tang", title = "{Baymax}: {QoS} Awareness and Increased Utilization for Non-Preemptive Accelerators in Warehouse Scale Computers", journal = j-SIGPLAN, volume = "51", number = "4", pages = "681--696", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872368", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern warehouse-scale computers (WSCs) are being outfitted with accelerators to provide the significant compute required by emerging intelligent personal assistant (IPA) workloads such as voice recognition, image classification, and natural language processing. It is well known that the diurnal user access pattern of user-facing services provides a strong incentive to co-locate applications for better accelerator utilization and efficiency, and prior work has focused on enabling co-location on multicore processors. However, interference when co-locating applications on non-preemptive accelerators is fundamentally different than contention on multi-core CPUs and introduces a new set of challenges to reduce QoS violation. To address this open problem, we first identify the underlying causes for QoS violation in accelerator-outfitted servers. Our experiments show that queuing delay for the compute resources and PCI-e bandwidth contention for data transfer are the two main factors that contribute to the long tails of user-facing applications. We then present Baymax, a runtime system that orchestrates the execution of compute tasks from different applications and mitigates PCI-e bandwidth contention to deliver the required QoS for user-facing applications and increase the accelerator utilization. Using DjiNN, a deep neural network service, Sirius, an end-to-end IPA workload, and traditional applications on a Nvidia K40 GPU, our evaluation shows that Baymax improves the accelerator utilization by 91.3\% while achieving the desired 99\%-ile latency target for user-facing applications. In fact, Baymax reduces the 99\%-ile latency of user-facing applications by up to 195x over default execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Nowatzki:2016:ABS, author = "Tony Nowatzki and Karthikeyan Sankaralingam", title = "Analyzing Behavior Specialized Acceleration", journal = j-SIGPLAN, volume = "51", number = "4", pages = "697--711", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872412", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hardware specialization has become a promising paradigm for overcoming the inefficiencies of general purpose microprocessors.
Of significant interest are Behavioral Specialized Accelerators (BSAs), which are designed to efficiently execute code with only certain properties, but remain largely configurable or programmable. The most important strength of BSAs --- their ability to target a wide variety of codes --- also makes their interactions and analysis complex, raising the following questions: can multiple BSAs be composed synergistically, what are their interactions with the general purpose core, and what combinations favor which workloads? From a methodological standpoint, BSAs are also challenging, as they each require ISA development, compiler and assembler extensions, and either simulator or RTL models. To study the potential of BSAs, we propose a novel modeling technique called the Transformable Dependence Graph (TDG) --- a higher level alternative to the time-consuming traditional compiler+simulator approach, while still enabling detailed microarchitectural models for both general cores and accelerators. We then propose a multi-BSA organization, called ExoCore, which we model and study using the TDG. A design space exploration reveals that an ExoCore organization can push designs beyond the established energy-performance frontiers for general purpose cores. For example, a 2-wide OOO processor with three BSAs matches the performance of a conventional 6-wide OOO core, has 40\% lower area, and is 2.6x more energy efficient.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Yoon:2016:PPI, author = "Man-Ki Yoon and Negin Salajegheh and Yin Chen and Mihai Christodorescu", title = "{PIFT}: Predictive Information-Flow Tracking", journal = j-SIGPLAN, volume = "51", number = "4", pages = "713--725", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872403", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Phones today carry sensitive information and have a great number of ways to communicate that data. As a result, malware that steal money, information, or simply disable functionality have hit the app stores. Current security solutions for preventing undesirable data leaks are mostly high-overhead and have not been practical enough for smartphones. In this paper, we show that by monitoring just some instructions (only memory loads and stores), it is possible to achieve low-overhead, highly accurate information flow tracking. Our method achieves 98\% accuracy (0\% false positives and 2\% false negatives) over DroidBench and was able to successfully catch seven real-world malware instances that steal phone number, location, and device ID using SMS messages and HTTP connections.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Venkat:2016:HHI, author = "Ashish Venkat and Sriskanda Shamasunder and Hovav Shacham and Dean M.
Tullsen", title = "{HIPStR}: Heterogeneous-{ISA} Program State Relocation", journal = j-SIGPLAN, volume = "51", number = "4", pages = "727--741", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872408", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Heterogeneous Chip Multiprocessors have been shown to provide significant performance and energy efficiency gains over homogeneous designs. Recent research has expanded the dimensions of heterogeneity to include diverse Instruction Set Architectures, called Heterogeneous-ISA Chip Multiprocessors. This work leverages such an architecture to realize substantial new security benefits, and in particular, to thwart Return-Oriented Programming. This paper proposes a novel security defense called HIPStR --- Heterogeneous-ISA Program State Relocation --- that performs dynamic randomization of run-time program state, both within and across ISAs. This technique outperforms the state-of-the-art just-in-time code reuse (JIT-ROP) defense by an average of 15.6\%, while simultaneously providing greater security guarantees against classic return-into-libc, ROP, JOP, brute force, JIT-ROP, and several evasive variants.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Aweke:2016:ASB, author = "Zelalem Birhanu Aweke and Salessawi Ferede Yitbarek and Rui Qiao and Reetuparna Das and Matthew Hicks and Yossi Oren and Todd Austin", title = "{ANVIL}: Software-Based Protection Against Next-Generation Rowhammer Attacks", journal = j-SIGPLAN, volume = "51", number = "4", pages = "743--755", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872390", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Ensuring the integrity and security of the memory system is critical. Recent studies have shown serious security concerns due to ``rowhammer'' attacks, where repeated accesses to a row of memory cause bit flips in adjacent rows. Recent work by Google's Project Zero has shown how to leverage rowhammer-induced bit-flips as the basis for security exploits that include malicious code injection and memory privilege escalation. Being an important security concern, industry has attempted to defend against rowhammer attacks. Deployed defenses employ two strategies: (1) doubling the system DRAM refresh rate and (2) restricting access to the CLFLUSH instruction that attackers use to bypass the cache to increase memory access frequency (i.e., the rate of rowhammering). We demonstrate that such defenses are inadequte: we implement rowhammer attacks that both avoid using the CLFLUSH instruction and cause bit flips with a doubled refresh rate. Our next-generation CLFLUSH-free rowhammer attack bypasses the cache by manipulating cache replacement state to allow frequent misses out of the last-level cache to DRAM rows of our choosing. To protect existing systems from more advanced rowhammer attacks, we develop a software-based defense, ANVIL, which thwarts all known rowhammer attacks on existing systems. 
ANVIL detects rowhammer attacks by tracking the locality of DRAM accesses using existing hardware performance counters. Our detector identifies the rows being frequently accessed (i.e., the aggressors), then selectively refreshes the nearby victim rows to prevent hammering. Experiments running on real hardware with the SPEC2006 benchmarks show that ANVIL has less than a 1\% false positive rate and an average slowdown of 1\%. ANVIL is low-cost and robust, and our experiments indicate that it is an effective approach for protecting existing and future systems from even advanced rowhammer attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Didona:2016:PAM, author = "Diego Didona and Nuno Diegues and Anne-Marie Kermarrec and Rachid Guerraoui and Ricardo Neves and Paolo Romano", title = "{ProteusTM}: Abstraction Meets Performance in Transactional Memory", journal = j-SIGPLAN, volume = "51", number = "4", pages = "757--771", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872385", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Transactional Memory (TM) paradigm promises to greatly simplify the development of concurrent applications. This led, over the years, to the creation of a plethora of TM implementations delivering wide ranges of performance across workloads. Yet, no universal implementation fits each and every workload. In fact, the best TM for a given workload can prove disastrous for another one. This forces developers to face the complex task of tuning TM implementations, which significantly hampers their wide adoption. In this paper, we address the challenge of automatically identifying the best TM implementation for a given workload. Our proposed system, ProteusTM, hides behind the TM interface a large library of implementations. Underneath, it leverages a novel multi-dimensional online optimization scheme, combining two popular learning techniques: Collaborative Filtering and Bayesian Optimization. We integrated ProteusTM in GCC and demonstrate its ability to switch between TMs and adapt several configuration parameters (e.g., number of threads). We extensively evaluated ProteusTM, obtaining average performance within {$<$3}\% of optimal, and gains up to 100x over static alternatives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Shalev:2016:CCS, author = "Noam Shalev and Eran Harpaz and Hagar Porat and Idit Keidar and Yaron Weinsberg", title = "{CSR}: Core Surprise Removal in Commodity Operating Systems", journal = j-SIGPLAN, volume = "51", number = "4", pages = "773--787", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872369", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "One of the adverse effects of shrinking transistor sizes is that processors have become increasingly prone to hardware faults. At the same time, the number of cores per die rises.
Consequently, core failures can no longer be ruled out, and future operating systems for many-core machines will have to incorporate fault tolerance mechanisms. We present CSR, a strategy for recovery from unexpected permanent processor faults in commodity operating systems. Our approach overcomes surprise removal of faulty cores, and also tolerates cascading core failures. When a core fails in user mode, CSR terminates the process executing on that core and migrates the remaining processes in its run-queue to other cores. We further show how hardware transactional memory may be used to overcome failures in critical kernel code. Our solution is scalable, incurs low overhead, and is designed to integrate into modern operating systems. We have implemented it in the Linux kernel, using Haswell's Transactional Synchronization Extension, and tested it on a real system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Gangwani:2016:CBS, author = "Tanmay Gangwani and Adam Morrison and Josep Torrellas", title = "{CASPAR}: Breaking Serialization in Lock-Free Multicore Synchronization", journal = j-SIGPLAN, volume = "51", number = "4", pages = "789--804", month = apr, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2954679.2872400", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 17:13:59 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In multicores, performance-critical synchronization is increasingly performed in a lock-free manner using atomic instructions such as CAS or LL/SC. However, when many processors synchronize on the same variable, performance can still degrade significantly. Contending writes get serialized, creating a non-scalable condition. Past proposals that build hardware queues of synchronizing processors do not fundamentally solve this problem --- at best, they help to efficiently serialize the contending writes. This paper proposes a novel architecture that breaks the serialization of hardware queues and enables the queued processors to perform lock-free synchronization in parallel. The architecture, called CASPAR, is able to (1) execute the CASes in the queued-up processors in parallel through eager forwarding of expected values, and (2) validate the CASes in parallel and dequeue groups of processors at a time. The result is highly-scalable synchronization. We evaluate CASPAR with simulations of a 64-core chip. Compared to existing proposals with hardware queues, CASPAR improves the throughput of kernels by 32\% on average, and reduces the execution time of the sections considered in lock-free versions of applications by 47\% on average. 
This makes these sections 2.5x faster than in the original applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '16 conference proceedings.", } @Article{Spink:2016:EAI, author = "Tom Spink and Harry Wagstaff and Bj{\"o}rn Franke", title = "Efficient asynchronous interrupt handling in a full-system instruction set simulator", journal = j-SIGPLAN, volume = "51", number = "5", pages = "1--10", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907953", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Instruction set simulators (ISS) have many uses in embedded software and hardware development and are typically based on dynamic binary translation (DBT), where frequently executed regions of guest instructions are compiled into host instructions using a just-in-time (JIT) compiler. Full-system simulation, which necessitates handling of asynchronous interrupts from e.g. timers and I/O devices, complicates matters as control flow is interrupted unpredictably and diverted from the current region of code. In this paper we present a novel scheme for handling asynchronous interrupts, which integrates seamlessly into a region-based dynamic binary translator. We first show that our scheme is correct, i.e. interrupt handling is not deferred indefinitely, even in the presence of code regions comprising control flow loops. We demonstrate that our new interrupt handling scheme is efficient as we minimise the number of inserted checks. Interrupt handlers are also presented to the JIT compiler and compiled to native code, further enhancing the performance of our system. We have evaluated our scheme in an ARM simulator using a region-based JIT compilation strategy. We demonstrate that our solution reduces the number of dynamic interrupt checks by 73\%, reduces interrupt service latency by 26\% and improves throughput of an I/O bound workload by 7\%, over traditional per-block schemes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Robinson:2016:CCM, author = "Forrest J. Robinson and Michael R. Jantz and Prasad A. Kulkarni", title = "Code cache management in managed language {VMs} to reduce memory consumption for embedded systems", journal = j-SIGPLAN, volume = "51", number = "5", pages = "11--20", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907958", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "The compiled native code generated by a just-in-time (JIT) compiler in managed language virtual machines (VM) is placed in a region of memory called the code cache. Code cache management (CCM) in a VM is responsible for finding and evicting methods from the code cache to maintain execution correctness and manage program performance for a given code cache size or memory budget. Effective CCM can also boost program speed by enabling more aggressive JIT compilation, powerful optimizations, and improved hardware instruction cache and I-TLB performance.
Though important, CCM is an overlooked component in VMs. We find that the default CCM policies in Oracle's production-grade HotSpot VM perform poorly even at modest memory pressure. We develop a detailed simulation-based framework to model and evaluate the potential efficiency of many different CCM policies in a controlled and realistic, but VM-independent environment. We make the encouraging discovery that effective CCM policies can sustain high program performance even for very small cache sizes. Our simulation study provides the rationale and motivation to improve CCM strategies in existing VMs. We implement and study the properties of several CCM policies in HotSpot. We find that in spite of working within the bounds of the HotSpot VM's current CCM sub-system, our best CCM policy implementation in HotSpot improves program performance over the default CCM algorithm by 39\%, 41\%, 55\%, and 50\% with code cache sizes that are 90\%, 75\%, 50\%, and 25\% of the desired cache size, on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Nobre:2016:GBI, author = "Ricardo Nobre and Luiz G. A. Martins and Jo{\~a}o M. P. Cardoso", title = "A graph-based iterative compiler pass selection and phase ordering approach", journal = j-SIGPLAN, volume = "51", number = "5", pages = "21--30", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907959", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nowadays compilers include tens or hundreds of optimization passes, which makes it difficult to find sequences of optimizations that achieve compiled code more optimized than the one obtained using typical compiler options such as -O2 and -O3. The problem involves both the selection of the compiler passes to use and their ordering in the compilation pipeline. The improvement achieved by the use of custom phase orders for each function can be significant, and thus important to satisfy strict requirements such as the ones present in high-performance embedded computing systems. In this paper we present a new and fast iterative approach to the phase selection and ordering challenges resulting in compiled code with higher performance than the one achieved with the standard optimization levels of the LLVM compiler. The obtained performance improvements are comparable with the ones achieved by other iterative approaches while requiring considerably less time and resources. Our approach is based on sampling over a graph representing transitions between compiler passes. We performed a number of experiments targeting the LEON3 microarchitecture using the Clang/LLVM 3.7 compiler, considering 140 LLVM passes and a set of 42 representative signal and image processing C functions. An exhaustive cross-validation shows our new exploration method is able to achieve a geometric mean performance speedup of 1.28x over the best individually selected -OX flag when considering 100,000 iterations; versus geometric mean speedups from 1.16x to 1.25x obtained with state-of-the-art iterative methods not using the graph. 
From the set of exploration methods tested, our new method is the only one consistently finding compiler sequences that result in performance improvements when considering 100 or less exploration iterations. Specifically, it achieved geometric mean speedups of 1.08x and 1.16x for 10 and 100 iterations, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Banerjee:2016:TVL, author = "Kunal Banerjee and Chittaranjan Mandal and Dipankar Sarkar", title = "Translation validation of loop and arithmetic transformations in the presence of recurrences", journal = j-SIGPLAN, volume = "51", number = "5", pages = "31--40", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907954", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compiler optimization of array-intensive programs involves extensive application of loop transformations and arithmetic transformations. Hence, translation validation of array-intensive programs requires manipulation of intervals of integers (representing domains of array indices) and relations over such intervals to account for loop transformations and simplification of arithmetic expressions to handle arithmetic transformations. A major obstacle for verification of such programs is posed by the presence of recurrences, whereby an element of an array gets defined in a statement S inside a loop in terms of some other element(s) of the same array which have been previously defined through the same statement S. Recurrences lead to cycles in the data-dependence graph of a program which make dependence analyses and simplifications (through closed-form representations) of the data transformations difficult. Another technique which works better for recurrences does not handle arithmetic transformations. In this work, array data-dependence graphs (ADDGs) are used to represent both the original and the optimized versions of the program and a validation scheme is proposed where the cycles due to recurrences in the ADDGs are suitably abstracted as acyclic subgraphs. Thus, this work provides a unified equivalence checking framework to handle loop and arithmetic transformations along with most of the recurrences --- this combination of features had not been achieved by a single verification technique earlier.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Sui:2016:LOA, author = "Yulei Sui and Xiaokang Fan and Hao Zhou and Jingling Xue", title = "Loop-oriented array- and field-sensitive pointer analysis for automatic {SIMD} vectorization", journal = j-SIGPLAN, volume = "51", number = "5", pages = "41--51", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907957", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compiler-based auto-vectorization is a promising solution to automatically generate code that makes efficient use of SIMD processors in high performance platforms and embedded systems. 
Two main auto-vectorization techniques, superword-level parallelism vectorization (SLP) and loop-level vectorization (LLV), require precise dependence analysis on arrays and structs in order to vectorize isomorphic scalar instructions and/or reduce dynamic dependence checks incurred at runtime. The alias analyses used in modern vectorizing compilers are either intra-procedural (without tracking inter-procedural data-flows) or inter-procedural (by using field-insensitive models, which are too imprecise in handling arrays and structs). This paper proposes an inter-procedural Loop-oriented Pointer Analysis, called LPA, for analyzing arrays and structs to support aggressive SLP and LLV optimizations. Unlike field-insensitive solutions that preallocate objects for each memory allocation site, our approach uses a fine-grained memory model to generate location sets based on how structs and arrays are accessed. LPA can precisely analyze arrays and nested aggregate structures to enable SIMD optimizations for large programs. By separating the location set generation as an independent concern from the rest of the pointer analysis, LPA is designed to easily reuse existing points-to resolution algorithms. We evaluate LPA using SLP and LLV, the two classic vectorization techniques on a set of 20 CPU2000/2006 benchmarks. For SLP, LPA enables it to vectorize a total of 133 more basic blocks, with an average of 12.09 per benchmark, resulting in the best speedup of 2.95\% for 173.applu. For LLV, LPA has reduced a total of 319 static bound checks, with an average of 22.79 per benchmark, resulting in the best speedup of 7.18\% for 177.mesa.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Domagala:2016:GCT, author = "Lukasz Domagala and Duco van Amstel and Fabrice Rastello", title = "Generalized cache tiling for dataflow programs", journal = j-SIGPLAN, volume = "51", number = "5", pages = "52--61", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907960", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The dataflow programming paradigm has facilitated the expression of a great number of algorithmic applications on embedded platforms in a wide variety of applicative domains. Whether it is a Domain Specific Language (DSL) or a more general-purpose one, the dataflow paradigm allows one to intuitively state the successive steps of an algorithm and link them through data communications. The optimization of cache-memory in this context has been a subject of interest since the early '90s as the reuse and communication of data between the agents of a dataflow program is a key factor in achieving a high-performance implementation within the reduced limits of embedded architectures. In order to improve data reuse among the dataflow agents we propose a modelisation of the communications and data usage within a dataflow program. Aside from providing an estimate of the amount of cache-misses that a given scheduling generates, this model allows us to specify the associated optimization problem in a manner that is identical to loop-nest tiling. Improving on the existing state-of-the-art methods we extend our tiling technique to include non-uniform dependencies on one of the dimensions of the iteration space.
When applying the proposed technique to dataflow programs expressed within the StreamIt framework we are able to showcase significant reductions in the number of cache-misses for a majority of test-cases when compared to existing optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Chu:2016:SEM, author = "Duc-Hiep Chu and Joxan Jaffar and Rasool Maghareh", title = "Symbolic execution for memory consumption analysis", journal = j-SIGPLAN, volume = "51", number = "5", pages = "62--71", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907955", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the advances in both hardware and software of embedded systems in the past few years, dynamic memory allocation can now be safely used in embedded software. As a result, the need to develop methods to avoid heap overflow errors in safety-critical embedded systems has increased. Resource analysis of imperative programs with non-regular loop patterns and signed integers, to support both memory allocation and deallocation, has long been an open problem. Existing methods can generate symbolic bounds that are parametric w.r.t. the program inputs; such bounds, however, are imprecise in the presence of non-regular loop patterns. In this paper, we present a worst-case memory consumption analysis, based upon the framework of symbolic execution. Our assumption is that loops (and recursions) of to-be-analyzed programs are indeed bounded. We then can exhaustively unroll loops and the memory consumption of each iteration can be precisely computed and summarized for aggregation. Because of path-sensitivity, our algorithm generates more precise bounds. Importantly, we demonstrate that by introducing a new concept of reuse, symbolic execution scales to a set of realistic benchmark programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Metta:2016:TSM, author = "Ravindra Metta and Martin Becker and Prasad Bokil and Samarjit Chakraborty and R. Venkatesh", title = "{TIC}: a scalable model checking based approach to {WCET} estimation", journal = j-SIGPLAN, volume = "51", number = "5", pages = "72--81", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907961", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The application of Model Checking to compute WCET has not been explored as much as Integer Linear Programming (ILP), primarily because model checkers fail to scale for complex programs. These programs have loops with large or unknown bounds, leading to a state space explosion that model checkers cannot handle. To overcome this, we have developed a technique, TIC, that employs slicing, loop acceleration and over-approximation on time-annotated source code, enabling Model Checking to scale better for WCET computation. Further, our approach is parametric, so that the user can make a trade-off between the tightness of WCET estimate and the analysis time. 
We conducted experiments on the M{\"a}lardalen benchmarks to evaluate the effect of various abstractions on the WCET estimate and analysis time. Additionally, we compared our estimates to those made by an ILP-based analyzer and found that our estimates were tighter for more than 30\% of the examples and were equal for the rest.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Chen:2016:CIM, author = "Kuan-Hsun Chen and Bj{\"o}rn B{\"o}nninghoff and Jian-Jia Chen and Peter Marwedel", title = "Compensate or ignore? {Meeting} control robustness requirements through adaptive soft-error handling", journal = j-SIGPLAN, volume = "51", number = "5", pages = "82--91", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907952", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To avoid catastrophic events like unrecoverable system failures on mobile and embedded systems caused by soft-errors, software-based error detection and compensation techniques have been proposed. Methods like error-correction codes or redundant execution can offer high flexibility and allow for application-specific fault-tolerance selection without the need for special hardware support. However, such software-based approaches may lead to system overload due to the execution time overhead. An adaptive deployment of such techniques to meet both application requirements and system constraints is desired. From our case study, we observe that a control task can tolerate limited errors with acceptable performance loss. Such tolerance can be modeled as an (m,k) constraint, which requires at least m out of any k consecutive runs to be correct. In this paper, we discuss how a given (m,k) constraint can be satisfied by adopting patterns of task instances with individual error detection and compensation capabilities. We introduce static strategies and provide a formal feasibility analysis for validation. Furthermore, we develop an adaptive scheme that extends our initial approach with online awareness that increases efficiency while preserving analysis results. The effectiveness of our method is shown in a real-world case study as well as for synthesized task sets.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Chakraborty:2016:OCP, author = "Prasenjit Chakraborty and Gautam Doshi and Shashank Shekhar and Vikrant Kumar", title = "Opportunity for compute partitioning in pursuit of energy-efficient systems", journal = j-SIGPLAN, volume = "51", number = "5", pages = "92--101", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907956", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Performance of computing systems, from handhelds to supercomputers, is increasingly constrained by the energy consumed. A significant and increasing fraction of the energy is consumed in the movement of data.
In a compute node, caches have been very effective in reducing data movement by exploiting the available data locality in programs. Program regions with poor data locality then effect most of the data movement, and consequently consume an ever larger fraction of energy. In this paper we explore the energy efficiency opportunity of minimizing the data movement in precisely such program regions, by first imagining the possibility of compute near memory, and then partitioning the program's execution between a compute core and the compute near memory (CnM). Due to the emergence of 3D stacked memory, a CnM implementation appears more realistic. Our focus is on evaluating the partitioning opportunity in applications and on doing a limit study of systems enabled with CnM capabilities to understand and guide their architectural embodiment. We describe an automated method of analyzing the data access pattern of optimized workload binaries, via a binary-instrumentation tool called SnapCnM, to identify the beneficial program regions (loops) for CnM execution. We also perform a limit study to evaluate the impact of such partitioning over a range of parameters affecting CnM design choices. Our results show that compute partitioning a small ({$<$10}\%) fraction of a workload can improve its energy efficiency from 3\% (for compute-bound applications) to 27\% (for memory-bound applications). From the study in this work we discuss the important aspects that help to shape the future CnM design space.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Phothilimthana:2016:CGR, author = "Phitchaya Mangpo Phothilimthana and Michael Schuldt and Rastislav Bodik", title = "Compiling a gesture recognition application for a low-power spatial architecture", journal = j-SIGPLAN, volume = "51", number = "5", pages = "102--112", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907962", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy efficiency is one of the main performance goals when designing processors for embedded systems. Typically, the simpler the processor, the less energy it consumes. Thus, an ultra-low power multicore processor will likely have very small distributed memory with a simple interconnect. To compile for such an architecture, a partitioning strategy that can tune between space and communication minimization is crucial to fit a program in its limited resources and achieve good performance. A careful program layout design is also critical. Aside from fulfilling the space constraint, a compiler needs to be able to optimize for program latency to satisfy a certain timing requirement as well. To satisfy all aforementioned constraints, we present a flexible code partitioning strategy and light-weight mechanisms to express parallelism and program layout. First, we compare two strategies for partitioning program structures and introduce a language construct to let programmers choose which strategies to use and when. The compiler then partitions program structures with a mix of both strategies. Second, we add support for programmer-specified parallelism and program layout through imposing additional spatial constraints on the compiler.
We evaluate our compiler by implementing an accelerometer-based gesture recognition application on GA144, a recent low-power minimalistic multicore architecture. When compared to MSP430, GA144 is overall 19x more energy-efficient and 23x faster when running this application. Without these inventions, this application would not be able to fit on GA144.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Micolet:2016:MLA, author = "Paul-Jules Micolet and Aaron Smith and Christophe Dubach", title = "A machine learning approach to mapping streaming workloads to dynamic multicore processors", journal = j-SIGPLAN, volume = "51", number = "5", pages = "113--122", month = may, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980930.2907951", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dataflow programming languages facilitate the design of data intensive programs such as streaming applications commonly found in embedded systems. They also expose parallelism that can be exploited using multicore processors which are now part of the mobile landscape. In recent years a shift has occurred towards heterogeneity ( ARM big.LITTLE) and reconfigurability. Dynamic Multicore Processors (DMPs) bridge the gap between fully reconfigurable processors and homogeneous multicore systems. They can re-allocate their resources at runtime to create larger more powerful logical processors fine-tuned to the workload. Unfortunately, there exists no accurate method to determine how to partition the cores in a DMP among application threads. Often programmers rely on analyzing the application manually and using a set of hand picked heuristics. This leads to sub-optimal performance, reducing the potential of DMPs. What is needed is a way to determine the optimal partitioning and grouping of resources to maximize performance. As a first step, this paper studies the effect of thread partitioning and hardware resource allocation on a set of StreamIt applications. We show that the resulting space is not trivial and exhibits a large performance variation depending on the combination of parameters. We introduce a machine-learning based methodology to tackle the space complexity. Our machine-learning model is able to directly predict the best combination of parameters using static code features. The predicted set of parameters leads to performance on-par with the best performance found in a space of more than 32,000 configurations per application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '16 conference proceedings.", } @Article{Memarian:2016:DCE, author = "Kayvan Memarian and Justus Matthiesen and James Lingard and Kyndylan Nienhuis and David Chisnall and Robert N. M. 
Watson and Peter Sewell", title = "Into the depths of {C}: elaborating the de facto standards", journal = j-SIGPLAN, volume = "51", number = "6", pages = "1--15", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908081", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "C remains central to our computing infrastructure. It is notionally defined by ISO standards, but in reality the properties of C assumed by systems code and those implemented by compilers have diverged, both from the ISO standards and from each other, and none of these are clearly understood. We make two contributions to help improve this error-prone situation. First, we describe an in-depth analysis of the design space for the semantics of pointers and memory in C as it is used in practice. We articulate many specific questions, build a suite of semantic test cases, gather experimental data from multiple implementations, and survey what C experts believe about the de facto standards. We identify questions where there is a consensus (either following ISO or differing) and where there are conflicts. We apply all this to an experimental C implemented above capability hardware. Second, we describe a formal model, Cerberus, for large parts of C. Cerberus is parameterised on its memory model; it is linkable either with a candidate de facto memory object model, under construction, or with an operational C11 concurrency model; it is defined by elaboration to a much simpler Core language for accessibility, and it is executable as a test oracle on small examples. This should provide a solid basis for discussion of what mainstream C is now: what programmers and analysis tools can assume and what compilers aim to implement. Ultimately we hope it will be a step towards clear, consistent, and accepted semantics for the various use-cases of C.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Chamith:2016:LER, author = "Buddhika Chamith and Bo Joel Svensson and Luke Dalessandro and Ryan R. Newton", title = "Living on the edge: rapid-toggling probes with cross-modification on x86", journal = j-SIGPLAN, volume = "51", number = "6", pages = "16--26", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908084", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic probe injection is now a widely used method to debug performance in production. Current techniques for dynamic probing of native code, however, rely on an expensive stop-the-world approach: binary changes are made within a safe state of the program --- typically in which all the program threads are halted --- to ensure that another thread executing the modified code region doesn't step into a partially-modified code. Stop-the-world patching is not scalable. In contrast, low overhead, scalable probes that can be rapidly toggled on and off in-place would open up new use cases for statistical profilers and language implementations, even traditional ahead-of-time, native-code compilers. 
In this paper we introduce safe cross-modification protocols that mutate x86 code between threads but do not require quiescing threads, resulting in radically lower overheads than existing solutions. A key problem is handling instructions that straddle cache lines. We empirically evaluate existing x86 architectures to derive a safe policy given current processor behavior, and we argue that future architectures should clarify the semantics of instruction fetching to make cheap cross-modification easier and future proof.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Noonan:2016:PTI, author = "Matt Noonan and Alexey Loginov and David Cok", title = "Polymorphic type inference for machine code", journal = j-SIGPLAN, volume = "51", number = "6", pages = "27--41", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908119", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For many compiled languages, source-level types are erased very early in the compilation process. As a result, further compiler passes may convert type-safe source into type-unsafe machine code. Type-unsafe idioms in the original source and type-unsafe optimizations mean that type information in a stripped binary is essentially nonexistent. The problem of recovering high-level types by performing type inference over stripped machine code is called type reconstruction, and offers a useful capability in support of reverse engineering and decompilation. In this paper, we motivate and develop a novel type system and algorithm for machine-code type inference. The features of this type system were developed by surveying a wide collection of common source- and machine-code idioms, building a catalog of challenging cases for type reconstruction. We found that these idioms place a sophisticated set of requirements on the type system, inducing features such as recursively-constrained polymorphic types. Many of the features we identify are often seen only in expressive and powerful type systems used by high-level functional languages. Using these type-system features as a guideline, we have developed Retypd: a novel static type-inference algorithm for machine code that supports recursive types, polymorphism, and subtyping. Retypd yields more accurate inferred types than existing algorithms, while also enabling new capabilities such as reconstruction of pointer const annotations with 98\% recall. 
Retypd can operate on weaker program representations than the current state of the art, removing the need for high-quality points-to information that may be impractical to compute.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Padhi:2016:DDP, author = "Saswat Padhi and Rahul Sharma and Todd Millstein", title = "Data-driven precondition inference with learned features", journal = j-SIGPLAN, volume = "51", number = "6", pages = "42--56", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908099", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We extend the data-driven approach to inferring preconditions for code from a set of test executions. Prior work requires a fixed set of features, atomic predicates that define the search space of possible preconditions, to be specified in advance. In contrast, we introduce a technique for on-demand feature learning, which automatically expands the search space of candidate preconditions in a targeted manner as necessary. We have instantiated our approach in a tool called PIE. In addition to making precondition inference more expressive, we show how to apply our feature-learning technique to the setting of data-driven loop invariant inference. We evaluate our approach by using PIE to infer rich preconditions for black-box OCaml library functions and using our loop-invariant inference algorithm as part of an automatic program verifier for C++ programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Sousa:2016:CHL, author = "Marcelo Sousa and Isil Dillig", title = "{Cartesian} {Hoare} logic for verifying $k$-safety properties", journal = j-SIGPLAN, volume = "51", number = "6", pages = "57--69", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908092", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Unlike safety properties which require the absence of a ``bad'' program trace, k-safety properties stipulate the absence of a ``bad'' interaction between $k$ traces. Examples of $k$-safety properties include transitivity, associativity, anti-symmetry, and monotonicity. This paper presents a sound and relatively complete calculus, called Cartesian Hoare Logic (CHL), for verifying $k$-safety properties. We also present an automated verification algorithm based on CHL and implement it in a tool called DESCARTES. 
We use DESCARTES to analyze user-defined relational operators in Java and demonstrate that DESCARTES is effective at verifying (or finding violations of) multiple $k$-safety properties.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Lee:2016:VBM, author = "Wonyeol Lee and Rahul Sharma and Alex Aiken", title = "Verifying bit-manipulations of floating-point", journal = j-SIGPLAN, volume = "51", number = "6", pages = "70--84", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908107", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reasoning about floating-point is difficult and becomes only more so if there is an interplay between floating-point and bit-level operations. Even though real-world floating-point libraries use implementations that have such mixed computations, no systematic technique to verify the correctness of the implementations of such computations is known. In this paper, we present the first general technique for verifying the correctness of mixed binaries, which combines abstraction, analytical optimization, and testing. The technique provides a method to compute an error bound of a given implementation with respect to its mathematical specification. We apply our technique to Intel's implementations of transcendental functions and prove formal error bounds for these widely used routines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Chen:2016:CDD, author = "Yuting Chen and Ting Su and Chengnian Sun and Zhendong Su and Jianjun Zhao", title = "Coverage-directed differential testing of {JVM} implementations", journal = j-SIGPLAN, volume = "51", number = "6", pages = "85--99", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908095", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Java virtual machine (JVM) is a core technology, whose reliability is critical. Testing JVM implementations requires painstaking effort in designing test classfiles (*.class) along with their test oracles. An alternative is to employ binary fuzzing to differentially test JVMs by blindly mutating seeding classfiles and then executing the resulting mutants on different JVM binaries for revealing inconsistent behaviors. However, this blind approach is not cost effective in practice because most of the mutants are invalid and redundant. This paper tackles this challenge by introducing classfuzz, a coverage-directed fuzzing approach that focuses on representative classfiles for differential testing of JVMs' startup processes. Our core insight is to (1) mutate seeding classfiles using a set of predefined mutation operators (mutators) and employ Markov Chain Monte Carlo (MCMC) sampling to guide mutator selection, and (2) execute the mutants on a reference JVM implementation and use coverage uniqueness as a discipline for accepting representative ones. 
The accepted classfiles are used as inputs to differentially test different JVM implementations and find defects. We have implemented classfuzz and conducted an extensive evaluation of it against existing fuzz testing algorithms. Our evaluation results show that classfuzz can enhance the ratio of discrepancy-triggering classfiles from 1.7\% to 11.9\%. We have also reported 62 JVM discrepancies, along with the test classfiles, to JVM developers. Many of our reported issues have already been confirmed as JVM defects, and some even match recent clarifications and changes to the Java SE 8 edition of the JVM specification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Sorensen:2016:EER, author = "Tyler Sorensen and Alastair F. Donaldson", title = "Exposing errors related to weak memory in {GPU} applications", journal = j-SIGPLAN, volume = "51", number = "6", pages = "100--113", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908114", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the systematic design of a testing environment that uses stressing and fuzzing to reveal errors in GPU applications that arise due to weak memory effects. We evaluate our approach on seven GPUs spanning three Nvidia architectures, across ten CUDA applications that use fine-grained concurrency. Our results show that applications that rarely or never exhibit errors related to weak memory when executed natively can readily exhibit these errors when executed in our testing environment. Our testing environment also provides a means to help identify the root causes of such errors, and automatically suggests how to insert fences that harden an application against weak memory bugs. To understand the cost of GPU fences, we benchmark applications with fences provided by the hardening strategy as well as a more conservative, sound fencing strategy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Faddegon:2016:LCT, author = "Maarten Faddegon and Olaf Chitil", title = "Lightweight computation tree tracing for lazy functional languages", journal = j-SIGPLAN, volume = "51", number = "6", pages = "114--128", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908104", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A computation tree of a program execution describes computations of functions and their dependencies. A computation tree describes how a program works and is at the heart of algorithmic debugging. To generate a computation tree, existing algorithmic debuggers either use a complex implementation or yield a less informative approximation. We present a method for lazy functional languages that requires only a simple tracing library to generate a detailed computation tree. 
With our algorithmic debugger a programmer can debug any Haskell program by only importing our library and annotating suspected functions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Hong:2016:EPM, author = "Changwan Hong and Wenlei Bao and Albert Cohen and Sriram Krishnamoorthy and Louis-No{\"e}l Pouchet and Fabrice Rastello and J. Ramanujam and P. Sadayappan", title = "Effective padding of multidimensional arrays to avoid cache conflict misses", journal = j-SIGPLAN, volume = "51", number = "6", pages = "129--144", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908123", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Caches are used to significantly improve performance. Even with high degrees of set associativity, the number of accessed data elements mapping to the same set in a cache can easily exceed the degree of associativity. This can cause conflict misses and lower performance, even if the working set is much smaller than cache capacity. Array padding (increasing the size of array dimensions) is a well-known optimization technique that can reduce conflict misses. In this paper, we develop the first algorithms for optimal padding of arrays aimed at a set-associative cache for arbitrary tile sizes. In addition, we develop the first solution to padding for nested tiles and multi-level caches. Experimental results with multiple benchmarks demonstrate a significant performance improvement from padding.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Zhu:2016:GLE, author = "Yuhao Zhu and Vijay Janapa Reddi", title = "{GreenWeb}: language extensions for energy-efficient mobile web computing", journal = j-SIGPLAN, volume = "51", number = "6", pages = "145--160", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908082", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Web computing is gradually shifting toward mobile devices, in which the energy budget is severely constrained. As a result, Web developers must be conscious of energy efficiency. However, current Web languages provide developers little control over energy consumption. In this paper, we take a first step toward language-level research to enable energy-efficient Web computing. Our key motivation is that mobile systems can wisely budget energy usage if informed with user quality-of-service (QoS) constraints. To do this, programmers need new abstractions. We propose two language abstractions, QoS type and QoS target, to capture two fundamental aspects of user QoS experience. We then present GreenWeb, a set of language extensions that empower developers to easily express the QoS abstractions as program annotations. As a proof of concept, we develop a GreenWeb runtime, which intelligently determines how to deliver specified user QoS expectation while minimizing energy consumption. 
Overall, GreenWeb shows significant energy savings (29.2\% ~ 66.0\%) over Android's default Interactive governor with few QoS violations. Our work demonstrates a promising first step toward language innovations for energy-efficient Web computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Laurenzano:2016:IRU, author = "Michael A. Laurenzano and Parker Hill and Mehrzad Samadi and Scott Mahlke and Jason Mars and Lingjia Tang", title = "Input responsiveness: using canary inputs to dynamically steer approximation", journal = j-SIGPLAN, volume = "51", number = "6", pages = "161--176", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908087", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces Input Responsive Approximation (IRA), an approach that uses a canary input --- a small program input carefully constructed to capture the intrinsic properties of the original input --- to automatically control how program approximation is applied on an input-by-input basis. Motivating this approach is the observation that many of the prior techniques focusing on choosing how to approximate arrive at conservative decisions by discounting substantial differences between inputs when applying approximation. The main challenges in overcoming this limitation lie in making the choice of how to approximate both effectively (e.g., the fastest approximation that meets a particular accuracy target) and rapidly for every input. With IRA, each time the approximate program is run, a canary input is constructed and used dynamically to quickly test a spectrum of approximation alternatives. Based on these runtime tests, the approximation that best fits the desired accuracy constraints is selected and applied to the full input to produce an approximate result. We use IRA to select and parameterize mixes of four approximation techniques from the literature for a range of 13 image processing, machine learning, and data mining applications. Our results demonstrate that IRA significantly outperforms prior approaches, delivering an average of 10.2$ \times $ speedup over exact execution while minimizing accuracy losses in program outputs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Achour:2016:CSP, author = "Sara Achour and Rahul Sarpeshkar and Martin C. Rinard", title = "Configuration synthesis for programmable analog devices with {Arco}", journal = j-SIGPLAN, volume = "51", number = "6", pages = "177--193", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908116", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmable analog devices have emerged as a powerful computing substrate for performing complex neuromorphic and cytomorphic computations. 
We present Arco, a new solver that, given a dynamical system specification in the form of a set of differential equations, generates physically realizable configurations for programmable analog devices that are algebraically equivalent to the specified system. On a set of benchmarks from the biological domain, Arco generates configurations with 35 to 534 connections and 28 to 326 components in 1 to 54 minutes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Madsen:2016:DFD, author = "Magnus Madsen and Ming-Ho Yee and Ondrej Lhot{\'a}k", title = "From {Datalog} to {Flix}: a declarative language for fixed points on lattices", journal = j-SIGPLAN, volume = "51", number = "6", pages = "194--208", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908096", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Flix, a declarative programming language for specifying and solving least fixed point problems, particularly static program analyses. Flix is inspired by Datalog and extends it with lattices and monotone functions. Using Flix, implementors of static analyses can express a broader range of analyses than is currently possible in pure Datalog, while retaining its familiar rule-based syntax. We define a model-theoretic semantics of Flix as a natural extension of the Datalog semantics. This semantics captures the declarative meaning of Flix programs without imposing any specific evaluation strategy. An efficient strategy is semi-naive evaluation which we adapt for Flix. We have implemented a compiler and runtime for Flix, and used it to express several well-known static analyses, including the IFDS and IDE algorithms. The declarative nature of Flix clearly exposes the similarity between these two algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Truong:2016:LLC, author = "Leonard Truong and Rajkishore Barik and Ehsan Totoni and Hai Liu and Chick Markley and Armando Fox and Tatiana Shpeisman", title = "{Latte}: a language, compiler, and runtime for elegant and efficient deep neural networks", journal = j-SIGPLAN, volume = "51", number = "6", pages = "209--223", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908105", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Deep neural networks (DNNs) have undergone a surge in popularity with consistent advances in the state of the art for tasks including image recognition, natural language processing, and speech recognition. The computationally expensive nature of these networks has led to the proliferation of implementations that sacrifice abstraction for high performance. In this paper, we present Latte, a domain-specific language for DNNs that provides a natural abstraction for specifying new layers without sacrificing performance. Users of Latte express DNNs as ensembles of neurons with connections between them. 
The Latte compiler synthesizes a program based on the user specification, applies a suite of domain-specific and general optimizations, and emits efficient machine code for heterogeneous architectures. Latte also includes a communication runtime for distributed memory data-parallelism. Using networks described using Latte, we demonstrate 3-6x speedup over Caffe (C++/MKL) on the three state-of-the-art ImageNet models executing on an Intel Xeon E5-2699 v3 x86 CPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Adams:2016:CPP, author = "Michael D. Adams and Celeste Hollenbeck and Matthew Might", title = "On the complexity and performance of parsing with derivatives", journal = j-SIGPLAN, volume = "51", number = "6", pages = "224--236", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908128", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Current algorithms for context-free parsing inflict a trade-off between ease of understanding, ease of implementation, theoretical complexity, and practical performance. No algorithm achieves all of these properties simultaneously. Might et al. introduced parsing with derivatives, which handles arbitrary context-free grammars while being both easy to understand and simple to implement. Despite much initial enthusiasm and a multitude of independent implementations, its worst-case complexity has never been proven to be better than exponential. In fact, high-level arguments claiming it is fundamentally exponential have been advanced and even accepted as part of the folklore. Performance ended up being sluggish in practice, and this sluggishness was taken as informal evidence of exponentiality. In this paper, we reexamine the performance of parsing with derivatives. We have discovered that it is not exponential but, in fact, cubic. Moreover, simple (though perhaps not obvious) modifications to the implementation by Might et al. lead to an implementation that is not only easy to understand but also highly performant in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Heule:2016:SSA, author = "Stefan Heule and Eric Schkufza and Rahul Sharma and Alex Aiken", title = "Stratified synthesis: automatically learning the x86-64 instruction set", journal = j-SIGPLAN, volume = "51", number = "6", pages = "237--250", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908121", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The x86-64 ISA sits at the bottom of the software stack of most desktop and server software. Because of its importance, many software analysis and verification tools depend, either explicitly or implicitly, on correct modeling of the semantics of x86-64 instructions. However, formal semantics for the x86-64 ISA are difficult to obtain and often written manually through great effort. 
We describe an automatically synthesized formal semantics of the input/output behavior for a large fraction of the x86-64 Haswell ISA's many thousands of instruction variants. The key to our results is stratified synthesis, where we use a set of instructions whose semantics are known to synthesize the semantics of additional instructions whose semantics are unknown. As the set of formally described instructions increases, the synthesis vocabulary expands, making it possible to synthesize the semantics of increasingly complex instructions. Using this technique we automatically synthesized formal semantics for 1,795 instruction variants of the x86-64 Haswell ISA. We evaluate the learned semantics against manually written semantics (where available) and find that they are formally equivalent with the exception of 50 instructions, where the manually written semantics contain an error. We further find the learned formulas to be largely as precise as manually written ones and of similar size.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Eizenberg:2016:ROD, author = "Ariel Eizenberg and Shiliang Hu and Gilles Pokam and Joseph Devietti", title = "{Remix}: online detection and repair of cache contention for the {JVM}", journal = j-SIGPLAN, volume = "51", number = "6", pages = "251--265", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908090", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As ever more computation shifts onto multicore architectures, it is increasingly critical to find effective ways of dealing with multithreaded performance bugs like true and false sharing. Previous approaches to fixing false sharing in unmanaged languages have employed highly-invasive runtime program modifications. We observe that managed language runtimes, with garbage collection and JIT code compilation, present unique opportunities to repair such bugs directly, mirroring the techniques used in manual repairs. We present Remix, a modified version of the Oracle HotSpot JVM which can detect cache contention bugs and repair false sharing at runtime. Remix's detection mechanism leverages recent performance counter improvements on Intel platforms, which allow for precise, unobtrusive monitoring of cache contention at the hardware level. Remix can detect and repair known false sharing issues in the LMAX Disruptor high-performance inter-thread messaging library and the Spring Reactor event-processing framework, automatically providing 1.5-2x speedups over unoptimized code and matching the performance of hand-optimization. Remix also finds a new false sharing bug in SPECjvm2008, and uncovers a true sharing bug in the HotSpot JVM that, when fixed, improves the performance of three NAS Parallel Benchmarks by 7-25x. 
Remix incurs no statistically-significant performance overhead on other benchmarks that do not exhibit cache contention, making Remix practical for always-on use.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{David:2016:SSB, author = "Yaniv David and Nimrod Partush and Eran Yahav", title = "Statistical similarity of binaries", journal = j-SIGPLAN, volume = "51", number = "6", pages = "266--280", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908126", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We address the problem of finding similar procedures in stripped binaries. We present a new statistical approach for measuring the similarity between two procedures. Our notion of similarity allows us to find similar code even when it has been compiled using different compilers, or has been modified. The main idea is to use similarity by composition: decompose the code into smaller comparable fragments, define semantic similarity between fragments, and use statistical reasoning to lift fragment similarity into similarity between procedures. We have implemented our approach in a tool called Esh, and applied it to find various prominent vulnerabilities across compilers and versions, including Heartbleed, Shellshock and Venom. We show that Esh produces high accuracy results, with few to no false positives --- a crucial factor in the scenario of vulnerability search in stripped binaries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Zhang:2016:ABS, author = "Yizhou Zhang and Guido Salvaneschi and Quinn Beightol and Barbara Liskov and Andrew C. Myers", title = "Accepting blame for safe tunneled exceptions", journal = j-SIGPLAN, volume = "51", number = "6", pages = "281--295", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908086", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Unhandled exceptions crash programs, so a compile-time check that exceptions are handled should in principle make software more reliable. But designers of some recent languages have argued that the benefits of statically checked exceptions are not worth the costs. We introduce a new statically checked exception mechanism that addresses the problems with existing checked-exception mechanisms. In particular, it interacts well with higher-order functions and other design patterns. The key insight is that whether an exception should be treated as a ``checked'' exception is not a property of its type but rather of the context in which the exception propagates. Statically checked exceptions can ``tunnel'' through code that is oblivious to their presence, but the type system nevertheless checks that these exceptions are handled. Further, exceptions can be tunneled without being accidentally caught, by expanding the space of exception identifiers to identify the exception-handling context. The resulting mechanism is expressive and syntactically light, and can be implemented efficiently. 
We demonstrate the expressiveness of the mechanism using significant codebases and evaluate its performance. We have implemented this new exception mechanism as part of the new Genus programming language, but the mechanism could equally well be applied to other programming languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Kent:2016:OTM, author = "Andrew M. Kent and David Kempe and Sam Tobin-Hochstadt", title = "Occurrence typing modulo theories", journal = j-SIGPLAN, volume = "51", number = "6", pages = "296--309", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908091", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new type system combining occurrence typing --- a technique previously used to type check programs in dynamically-typed languages such as Racket, Clojure, and JavaScript --- with dependent refinement types. We demonstrate that the addition of refinement types allows the integration of arbitrary solver-backed reasoning about logical propositions from external theories. By building on occurrence typing, we can add our enriched type system as a natural extension of Typed Racket, reusing its core while increasing its expressiveness. The result is a well-tested type system with a conservative, decidable core in which types may depend on a small but extensible set of program terms. In addition to describing our design, we present the following: a formal model and proof of correctness; a strategy for integrating new theories, with specific examples including linear arithmetic and bitvectors; and an evaluation in the context of the full Typed Racket implementation. Specifically, we take safe vector operations as a case study, examining all vector accesses in a 56,000 line corpus of Typed Racket programs. Our system is able to prove that 50\% of these are safe with no new annotations, and with a few annotations and modifications we capture more than 70\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Vekris:2016:RTT, author = "Panagiotis Vekris and Benjamin Cosman and Ranjit Jhala", title = "Refinement types for {TypeScript}", journal = j-SIGPLAN, volume = "51", number = "6", pages = "310--325", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908110", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Refined TypeScript (RSC), a lightweight refinement type system for TypeScript, that enables static verification of higher-order, imperative programs. We develop a formal system for RSC that delineates the interaction between refinement types and mutability, and enables flow-sensitive reasoning by translating input programs to an equivalent intermediate SSA form. By establishing type safety for the intermediate form, we prove safety for the input programs. 
Next, we extend the core to account for imperative and dynamic features of TypeScript, including overloading, type reflection, ad hoc type hierarchies and object initialization. Finally, we evaluate RSC on a set of real-world benchmarks, including parts of the Octane benchmarks, D3, Transducers, and the TypeScript compiler. We show how RSC successfully establishes a number of value dependent properties, such as the safety of array accesses and downcasts, while incurring a modest overhead in type annotations and code restructuring.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Smith:2016:MPS, author = "Calvin Smith and Aws Albarghouthi", title = "{MapReduce} program synthesis", journal = j-SIGPLAN, volume = "51", number = "6", pages = "326--340", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908102", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "By abstracting away the complexity of distributed systems, large-scale data processing platforms-MapReduce, Hadoop, Spark, Dryad, etc.-have provided developers with simple means for harnessing the power of the cloud. In this paper, we ask whether we can automatically synthesize MapReduce-style distributed programs from input-output examples. Our ultimate goal is to enable end users to specify large-scale data analyses through the simple interface of examples. We thus present a new algorithm and tool for synthesizing programs composed of efficient data-parallel operations that can execute on cloud computing infrastructure. We evaluate our tool on a range of real-world big-data analysis tasks and general computations. Our results demonstrate the efficiency of our approach and the small number of examples it requires to synthesize correct, scalable programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Chugh:2016:PDM, author = "Ravi Chugh and Brian Hempel and Mitchell Spradlin and Jacob Albers", title = "Programmatic and direct manipulation, together at last", journal = j-SIGPLAN, volume = "51", number = "6", pages = "341--354", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908103", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Direct manipulation interfaces and programmatic systems have distinct and complementary strengths. The former provide intuitive, immediate visual feedback and enable rapid prototyping, whereas the latter enable complex, reusable abstractions. Unfortunately, existing systems typically force users into just one of these two interaction modes. We present a system called Sketch-n-Sketch that integrates programmatic and direct manipulation for the particular domain of Scalable Vector Graphics (SVG). In Sketch-n-Sketch, the user writes a program to generate an output SVG canvas. Then the user may directly manipulate the canvas while the system immediately infers a program update in order to match the changes to the output, a workflow we call live synchronization. 
To achieve this, we propose (i) a technique called trace-based program synthesis that takes program execution history into account in order to constrain the search space and (ii) heuristics for dealing with ambiguities. Based on our experience with examples spanning 2,000 lines of code and from the results of a preliminary user study, we believe that Sketch-n-Sketch provides a novel workflow that can augment traditional programming systems. Our approach may serve as the basis for live synchronization in other application domains, as well as a starting point for yet more ambitious ways of combining programmatic and direct manipulation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Loncaric:2016:FSF, author = "Calvin Loncaric and Emina Torlak and Michael D. Ernst", title = "Fast synthesis of fast collections", journal = j-SIGPLAN, volume = "51", number = "6", pages = "355--368", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908122", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many applications require specialized data structures not found in the standard libraries, but implementing new data structures by hand is tedious and error-prone. This paper presents a novel approach for synthesizing efficient implementations of complex collection data structures from high-level specifications that describe the desired retrieval operations. Our approach handles a wider range of data structures than previous work, including structures that maintain an order among their elements or have complex retrieval methods. We have prototyped our approach in a data structure synthesizer called Cozy. Four large, real-world case studies compare structures generated by Cozy against handwritten implementations in terms of correctness and performance. Structures synthesized by Cozy match the performance of handwritten data structures while avoiding human error.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{McClurg:2016:EDN, author = "Jedidiah McClurg and Hossein Hojjat and Nate Foster and Pavol Cern{\'y}", title = "Event-driven network programming", journal = j-SIGPLAN, volume = "51", number = "6", pages = "369--385", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908097", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software-defined networking (SDN) programs must simultaneously describe static forwarding behavior and dynamic updates in response to events. Event-driven updates are critical to get right, but difficult to implement correctly due to the high degree of concurrency in networks. Existing SDN platforms offer weak guarantees that can break application invariants, leading to problems such as dropped packets, degraded performance, security violations, etc. This paper introduces EVENT-DRIVEN CONSISTENT UPDATES that are guaranteed to preserve well-defined behaviors when transitioning between configurations in response to events. 
We propose NETWORK EVENT STRUCTURES (NESs) to model constraints on updates, such as which events can be enabled simultaneously and causal dependencies between events. We define an extension of the NetKAT language with mutable state, give semantics to stateful programs using NESs, and discuss provably-correct strategies for implementing NESs in SDNs. Finally, we evaluate our approach empirically, demonstrating that it gives well-defined consistency guarantees while avoiding expensive synchronization and packet buffering.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Beckett:2016:TN, author = "Ryan Beckett and Michael Greenberg and David Walker", title = "Temporal {NetKAT}", journal = j-SIGPLAN, volume = "51", number = "6", pages = "386--401", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908108", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the past 5-10 years, the rise of software-defined networking (SDN) has inspired a wide range of new systems, libraries, hypervisors and languages for programming, monitoring, and debugging network behavior. Oftentimes, these systems are disjoint: one language for programming and another for verification, and yet another for run-time monitoring and debugging. In this paper, we present a new, unified framework, called Temporal NetKAT, capable of facilitating all of these tasks at once. As its name suggests, Temporal NetKAT is the synthesis of two formal theories: past-time (finite trace) linear temporal logic and (network) Kleene Algebra with Tests. Temporal predicates allow programmers to write down concise properties of a packet's path through the network and to make dynamic packet-forwarding, access control or debugging decisions on that basis. In addition to being useful for programming, the combined equational theory of LTL and NetKAT facilitates proofs of path-based correctness properties. Using new, general, proof techniques, we show that the equational semantics is sound with respect to the denotational semantics, and, for a class of programs we call network-wide programs, complete. We have also implemented a compiler for temporal NetKAT, evaluated its performance on a range of benchmarks, and studied the effectiveness of several optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{El-Hassany:2016:SCA, author = "Ahmed El-Hassany and Jeremie Miserez and Pavol Bielik and Laurent Vanbever and Martin Vechev", title = "{SDNRacer}: concurrency analysis for software-defined networks", journal = j-SIGPLAN, volume = "51", number = "6", pages = "402--415", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908124", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrency violations are an important source of bugs in Software-Defined Networks (SDN), often leading to policy or invariant violations. Unfortunately, concurrency violations are also notoriously difficult to avoid, detect and debug.
This paper presents a novel approach and a tool, SDNRacer, for detecting concurrency violations of SDNs. Our approach is enabled by three key ingredients: (i) a precise happens-before model for SDNs that captures when events can happen concurrently; (ii) a set of sound, domain-specific filters that reduce reported violations by orders of magnitude; and (iii) a sound and complete dynamic analyzer, based on the above, that can ensure the network is free of harmful errors such as data races and per-packet incoherence. We evaluated SDNRacer on several real-world OpenFlow controllers, running both reactive and proactive applications in large networks. We show that SDNRacer is practically effective: it quickly pinpoints harmful concurrency violations without overwhelming the user with false positives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Shambaugh:2016:RCV, author = "Rian Shambaugh and Aaron Weiss and Arjun Guha", title = "{Rehearsal}: a configuration verification tool for {Puppet}", journal = j-SIGPLAN, volume = "51", number = "6", pages = "416--430", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908083", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large-scale data centers and cloud computing have turned system configuration into a challenging problem. Several widely-publicized outages have been blamed not on software bugs, but on configuration bugs. To cope, thousands of organizations use system configuration languages to manage their computing infrastructure. Of these, Puppet is the most widely used with thousands of paying customers and many more open-source users. The heart of Puppet is a domain-specific language that describes the state of a system. Puppet already performs some basic static checks, but they only prevent a narrow range of errors. Furthermore, testing is ineffective because many errors are only triggered under specific machine states that are difficult to predict and reproduce. With several examples, we show that a key problem with Puppet is that configurations can be non-deterministic. This paper presents Rehearsal, a verification tool for Puppet configurations. Rehearsal implements a sound, complete, and scalable determinacy analysis for Puppet. To develop it, we (1) present a formal semantics for Puppet, (2) use several analyses to shrink our models to a tractable size, and (3) frame determinism-checking as decidable formulas for an SMT solver. Rehearsal then leverages the determinacy analysis to check other important properties, such as idempotency.
Finally, we apply Rehearsal to several real-world Puppet configurations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Chen:2016:TCV, author = "Hao Chen and Xiongnan (Newman) Wu and Zhong Shao and Joshua Lockerman and Ronghui Gu", title = "Toward compositional verification of interruptible {OS} kernels and device drivers", journal = j-SIGPLAN, volume = "51", number = "6", pages = "431--447", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908101", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "An operating system (OS) kernel forms the lowest level of any system software stack. The correctness of the OS kernel is the basis for the correctness of the entire system. Recent efforts have demonstrated the feasibility of building formally verified general-purpose kernels, but it is unclear how to extend their work to verify the functional correctness of device drivers, due to the non-local effects of interrupts. In this paper, we present a novel compositional framework for building certified interruptible OS kernels with device drivers. We provide a general device model that can be instantiated with various hardware devices, and a realistic formal model of interrupts, which can be used to reason about interruptible code. We have realized this framework in the Coq proof assistant. To demonstrate the effectiveness of our new approach, we have successfully extended an existing verified non-interruptible kernel with our framework and turned it into an interruptible kernel with verified device drivers. To the best of our knowledge, this is the first verified interruptible operating system with device drivers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Mullen:2016:VPO, author = "Eric Mullen and Daryl Zuniga and Zachary Tatlock and Dan Grossman", title = "Verified peephole optimizations for {CompCert}", journal = j-SIGPLAN, volume = "51", number = "6", pages = "448--461", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908109", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transformations over assembly code are common in many compilers. These transformations are also some of the most bug-dense compiler components. Such bugs could be eliminated by formally verifying the compiler, but state-of-the-art formally verified compilers like CompCert do not support assembly-level program transformations. This paper presents Peek, a framework for expressing, verifying, and running meaning-preserving assembly-level program transformations in CompCert. Peek contributes four new components: a lower level semantics for CompCert x86 syntax, a liveness analysis, a library for expressing and verifying peephole optimizations, and a verified peephole optimization pass built into CompCert. Each of these is accompanied by a correctness proof in Coq against realistic assumptions about the calling convention and the system memory allocator. 
Verifying peephole optimizations in Peek requires proving only a set of local properties, which we have proved are sufficient to ensure global transformation correctness. We have proven these local properties for 28 peephole transformations from the literature. We discuss the development of our new assembly semantics, liveness analysis, representation of program transformations, and execution engine; describe the verification challenges of each component; and detail techniques we applied to mitigate the proof burden.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Ren:2016:JTS, author = "Brianna M. Ren and Jeffrey S. Foster", title = "Just-in-time static type checking for dynamic languages", journal = j-SIGPLAN, volume = "51", number = "6", pages = "462--476", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908127", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic languages such as Ruby, Python, and JavaScript have many compelling benefits, but the lack of static types means subtle errors can remain latent in code for a long time. While many researchers have developed various systems to bring some of the benefits of static types to dynamic languages, prior approaches have trouble dealing with metaprogramming, which generates code as the program executes. In this paper, we propose Hummingbird, a new system that uses a novel technique, just-in-time static type checking, to type check Ruby code even in the presence of metaprogramming. In Hummingbird, method type signatures are gathered dynamically at run-time, as those methods are created. When a method is called, Hummingbird statically type checks the method body against current type signatures. Thus, Hummingbird provides thorough static checks on a per-method basis, while also allowing arbitrarily complex metaprogramming. For performance, Hummingbird memoizes the static type checking pass, invalidating cached checks only if necessary. We formalize Hummingbird using a core, Ruby-like language and prove it sound. To evaluate Hummingbird, we applied it to six apps, including three that use Ruby on Rails, a powerful framework that relies heavily on metaprogramming. We found that all apps typecheck successfully using Hummingbird, and that Hummingbird's performance overhead is reasonable. We applied Hummingbird to earlier versions of one Rails app and found several type errors that had been introduced and then fixed. 
Lastly, we demonstrate using Hummingbird in Rails development mode to typecheck an app as live updates are applied to it.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Petricek:2016:TDM, author = "Tomas Petricek and Gustavo Guerra and Don Syme", title = "Types from data: making structured data first-class citizens in {F\#}", journal = j-SIGPLAN, volume = "51", number = "6", pages = "477--490", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908115", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most modern applications interact with external services and access data in structured formats such as XML, JSON and CSV. Static type systems do not understand such formats, often making data access more cumbersome. Should we give up and leave the messy world of external data to dynamic typing and runtime checks? Of course, not! We present F\# Data, a library that integrates external structured data into F\#. As most real-world data does not come with an explicit schema, we develop a shape inference algorithm that infers a shape from representative sample documents. We then integrate the inferred shape into the F\# type system using type providers. We formalize the process and prove a relative type soundness theorem. Our library significantly reduces the amount of data access code and it provides additional safety guarantees when contrasted with the widely used weakly typed techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Zhu:2016:ALS, author = "He Zhu and Gustavo Petri and Suresh Jagannathan", title = "Automatically learning shape specifications", journal = j-SIGPLAN, volume = "51", number = "6", pages = "491--507", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908125", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a novel automated procedure for discovering expressive shape specifications for sophisticated functional data structures. Our approach extracts potential shape predicates based on the definition of constructors of arbitrary user-defined inductive data types, and combines these predicates within an expressive first-order specification language using a lightweight data-driven learning procedure. Notably, this technique requires no programmer annotations, and is equipped with a type-based decision procedure to verify the correctness of discovered specifications. 
Experimental results indicate that our implementation is both efficient and effective, capable of automatically synthesizing sophisticated shape specifications over a range of complex data types, going well beyond the scope of existing solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Yaghmazadeh:2016:STH, author = "Navid Yaghmazadeh and Christian Klinger and Isil Dillig and Swarat Chaudhuri", title = "Synthesizing transformations on hierarchically structured data", journal = j-SIGPLAN, volume = "51", number = "6", pages = "508--521", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908088", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a new approach for synthesizing transformations on tree-structured data, such as Unix directories and XML documents. We consider a general abstraction for such data, called hierarchical data trees (HDTs) and present a novel example-driven synthesis algorithm for HDT transformations. Our central insight is to reduce the problem of synthesizing tree transformers to the synthesis of list transformations that are applied to the paths of the tree. The synthesis problem over lists is solved using a new algorithm that combines SMT solving and decision tree learning. We have implemented our technique in a system called HADES and show that HADES can automatically synthesize a variety of interesting transformations collected from online forums.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Polikarpova:2016:PSP, author = "Nadia Polikarpova and Ivan Kuraj and Armando Solar-Lezama", title = "Program synthesis from polymorphic refinement types", journal = j-SIGPLAN, volume = "51", number = "6", pages = "522--538", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908093", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a method for synthesizing recursive functions that provably satisfy a given specification in the form of a polymorphic refinement type. We observe that such specifications are particularly suitable for program synthesis for two reasons. First, they offer a unique combination of expressive power and decidability, which enables automatic verification-and hence synthesis-of nontrivial programs. Second, a type-based specification for a program can often be effectively decomposed into independent specifications for its components, causing the synthesizer to consider fewer component combinations and leading to a combinatorial reduction in the size of the search space. At the core of our synthesis procedure is a new algorithm for refinement type checking, which supports specification decomposition. We have evaluated our prototype implementation on a large set of synthesis problems and found that it exceeds the state of the art in terms of both scalability and usability. 
The tool was able to synthesize more complex programs than those reported in prior work (several sorting algorithms and operations on balanced search trees), as well as most of the benchmarks tackled by existing synthesizers, often starting from a more concise and intuitive user input.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Maleki:2016:HOT, author = "Sepideh Maleki and Annie Yang and Martin Burtscher", title = "Higher-order and tuple-based massively-parallel prefix sums", journal = j-SIGPLAN, volume = "51", number = "6", pages = "539--552", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908089", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Prefix sums are an important parallel primitive, especially in massively-parallel programs. This paper discusses two orthogonal generalizations thereof, which we call higher-order and tuple-based prefix sums. Moreover, it describes and evaluates SAM, a GPU-friendly algorithm for computing prefix sums and other scans that directly supports higher orders and tuple values. Its templated CUDA implementation unifies all of these computations in a single 100-statement kernel. SAM is communication-efficient in the sense that it minimizes main-memory accesses. When computing prefix sums of a million or more values, it outperforms Thrust and CUDPP on both a Titan X and a K40 GPU. On the Titan X, SAM reaches memory-copy speeds for large input sizes, which cannot be surpassed. SAM outperforms CUB, the currently fastest conventional prefix sum implementation, by up to a factor of 2.9 on eighth-order prefix sums and by up to a factor of 2.6 on eight-tuple prefix sums.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Kim:2016:DOF, author = "Junghyun Kim and Gangwon Jo and Jaehoon Jung and Jungwon Kim and Jaejin Lee", title = "A distributed {OpenCL} framework using redundant computation and data replication", journal = j-SIGPLAN, volume = "51", number = "6", pages = "553--569", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908094", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Applications written solely in OpenCL or CUDA cannot execute on a cluster as a whole. Most previous approaches that extend these programming models to clusters are based on a common idea: designating a centralized host node and coordinating the other nodes with the host for computation. However, the centralized host node is a serious performance bottleneck when the number of nodes is large. In this paper, we propose a scalable and distributed OpenCL framework called SnuCL-D for large-scale clusters. SnuCL-D's remote device virtualization provides an OpenCL application with an illusion that all compute devices in a cluster are confined in a single node. 
To reduce the amount of control-message and data communication between nodes, SnuCL-D replicates the OpenCL host program execution and data in each node. We also propose a new OpenCL host API function and a queueing optimization technique that significantly reduce the overhead incurred by the previous centralized approaches. To show the effectiveness of SnuCL-D, we evaluate SnuCL-D with a microbenchmark and eleven benchmark applications on a large-scale CPU cluster and a medium-scale GPU cluster.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Degenbaev:2016:ITG, author = "Ulan Degenbaev and Jochen Eisinger and Manfred Ernst and Ross McIlroy and Hannes Payer", title = "Idle time garbage collection scheduling", journal = j-SIGPLAN, volume = "51", number = "6", pages = "570--583", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908106", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Efficient garbage collection is increasingly important in today's managed language runtime systems that demand low latency, low memory consumption, and high throughput. Garbage collection may pause the application for many milliseconds to identify live memory, free unused memory, and compact fragmented regions of memory, even when employing concurrent garbage collection. In animation-based applications that require 60 frames per second, these pause times may be observable, degrading user experience. This paper introduces idle time garbage collection scheduling to increase the responsiveness of applications by hiding expensive garbage collection operations inside of small, otherwise unused idle portions of the application's execution, resulting in smoother animations. Additionally we take advantage of idleness to reduce memory consumption while allowing higher memory use when high throughput is required. We implemented idle time garbage collection scheduling in V8, an open-source, production JavaScript virtual machine running within Chrome. We present performance results on various benchmarks running popular webpages and show that idle time garbage collection scheduling can significantly improve latency and memory consumption. Furthermore, we introduce a new metric called frame time discrepancy to quantify the quality of the user experience and precisely measure the improvements that idle time garbage collection provides for a WebGL-based game benchmark. 
Idle time garbage collection is shipped and enabled by default in Chrome.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Jacek:2016:ALP, author = "Nicholas Jacek and Meng-Chieh Chiu and Benjamin Marlin and Eliot Moss", title = "Assessing the limits of program-specific garbage collection performance", journal = j-SIGPLAN, volume = "51", number = "6", pages = "584--598", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908120", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We consider the ultimate limits of program-specific garbage collector performance for real programs. We first characterize the GC schedule optimization problem using Markov Decision Processes (MDPs). Based on this characterization, we develop a method of determining, for a given program run and heap size, an optimal schedule of collections for a non-generational collector. We further explore the limits of performance of a generational collector, where it is not feasible to search the space of schedules to prove optimality. Still, we show significant improvements with Least Squares Policy Iteration, a reinforcement learning technique for solving MDPs. We demonstrate that there is considerable promise to reduce garbage collection costs by developing program-specific collection policies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{vGleissenthall:2016:CUQ, author = "Klaus v. Gleissenthall and Nikolaj Bj{\o}rner and Andrey Rybalchenko", title = "Cardinalities and universal quantifiers for verifying parameterized systems", journal = j-SIGPLAN, volume = "51", number = "6", pages = "599--613", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908129", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parallel and distributed systems rely on intricate protocols to manage shared resources and synchronize, i.e., to manage how many processes are in a particular state. Effective verification of such systems requires universal quantification to reason about parameterized state and cardinalities that track sets of processes, messages, and failures, in order to adequately capture protocol logic. In this paper we present Tool, an automatic invariant synthesis method that integrates cardinality-based reasoning and universal quantification. The resulting increase of expressiveness allows Tool to verify, for the first time, a representative collection of intricate parameterized protocols.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Padon:2016:ISV, author = "Oded Padon and Kenneth L.
McMillan and Aurojit Panda and Mooly Sagiv and Sharon Shoham", title = "{Ivy}: safety verification by interactive generalization", journal = j-SIGPLAN, volume = "51", number = "6", pages = "614--630", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908118", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite several decades of research, the problem of formal verification of infinite-state systems has resisted effective automation. We describe a system --- Ivy --- for interactively verifying safety of infinite-state systems. Ivy's key principle is that whenever verification fails, Ivy graphically displays a concrete counterexample to induction. The user then interactively guides generalization from this counterexample. This process continues until an inductive invariant is found. Ivy searches for universally quantified invariants, and uses a restricted modeling language. This ensures that all verification conditions can be checked algorithmically. All user interactions are performed using graphical models, easing the user's task. We describe our initial experience with verifying several distributed protocols.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Yang:2016:PDI, author = "Jean Yang and Travis Hance and Thomas H. Austin and Armando Solar-Lezama and Cormac Flanagan and Stephen Chong", title = "Precise, dynamic information flow for database-backed applications", journal = j-SIGPLAN, volume = "51", number = "6", pages = "631--647", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908098", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an approach for dynamic information flow control across the application and database. Our approach reduces the amount of policy code required, yields formal guarantees across the application and database, works with existing relational database implementations, and scales for realistic applications. In this paper, we present a programming model that factors out information flow policies from application code and database queries, a dynamic semantics for the underlying $\lambda^{JDB}$ core language, and proofs of termination-insensitive non-interference and policy compliance for the semantics. We implement these ideas in Jacqueline, a Python web framework, and demonstrate feasibility through three application case studies: a course manager, a health record system, and a conference management system used to run an academic workshop.
We show that in comparison to traditional applications with hand-coded policy checks, Jacqueline applications have (1) a smaller trusted computing base, (2) fewer lines of policy code, and (3) reasonable, often negligible, additional overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Costanzo:2016:EEV, author = "David Costanzo and Zhong Shao and Ronghui Gu", title = "End-to-end verification of information-flow security for {C} and assembly programs", journal = j-SIGPLAN, volume = "51", number = "6", pages = "648--664", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908100", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Protecting the confidentiality of information manipulated by a computing system is one of the most important challenges facing today's cybersecurity community. A promising step toward conquering this challenge is to formally verify that the end-to-end behavior of the computing system really satisfies various information-flow policies. Unfortunately, because today's system software still consists of both C and assembly programs, the end-to-end verification necessarily requires that we not only prove the security properties of individual components, but also carefully preserve these properties through compilation and cross-language linking. In this paper, we present a novel methodology for formally verifying end-to-end security of a software system that consists of both C and assembly programs. We introduce a general definition of observation function that unifies the concepts of policy specification, state indistinguishability, and whole-execution behaviors. We show how to use different observation functions for different levels of abstraction, and how to link different security proofs across abstraction levels using a special kind of simulation that is guaranteed to preserve state indistinguishability. To demonstrate the effectiveness of our new methodology, we have successfully constructed an end-to-end security proof, fully formalized in the Coq proof assistant, of a nontrivial operating system kernel (running on an extended CompCert x86 assembly machine model). Some parts of the kernel are written in C and some are written in assembly; we verify all of the code, regardless of language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Sinha:2016:DVM, author = "Rohit Sinha and Manuel Costa and Akash Lal and Nuno P. Lopes and Sriram Rajamani and Sanjit A. Seshia and Kapil Vaswani", title = "A design and verification methodology for secure isolated regions", journal = j-SIGPLAN, volume = "51", number = "6", pages = "665--681", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908113", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hardware support for isolated execution (such as Intel SGX) enables development of applications that keep their code and data confidential even while running in a hostile or compromised host. 
However, automatically verifying that such applications satisfy confidentiality remains challenging. We present a methodology for designing such applications in a way that enables certifying their confidentiality. Our methodology consists of forcing the application to communicate with the external world through a narrow interface, compiling it with runtime checks that aid verification, and linking it with a small runtime that implements the narrow interface. The runtime includes services such as secure communication channels and memory management. We formalize this restriction on the application as Information Release Confinement (IRC), and we show that it allows us to decompose the task of proving confidentiality into (a) one-time, human-assisted functional verification of the runtime to ensure that it does not leak secrets, (b) automatic verification of the application's machine code to ensure that it satisfies IRC and does not directly read or corrupt the runtime's internal state. We present /CONFIDENTIAL: a verifier for IRC that is modular, automatic, and keeps our compiler out of the trusted computing base. Our evaluation suggests that the methodology scales to real-world applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Spiegelman:2016:TDS, author = "Alexander Spiegelman and Guy Golan-Gueta and Idit Keidar", title = "Transactional data structure libraries", journal = j-SIGPLAN, volume = "51", number = "6", pages = "682--696", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908112", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce transactions into libraries of concurrent data structures; such transactions can be used to ensure atomicity of sequences of data structure operations. By focusing on transactional access to a well-defined set of data structure operations, we strike a balance between the ease-of-programming of transactions and the efficiency of custom-tailored data structures. We exemplify this concept by designing and implementing a library supporting transactions on any number of maps, sets (implemented as skiplists), and queues. Our library offers efficient and scalable transactions, which are an order of magnitude faster than state-of-the-art transactional memory toolkits. Moreover, our approach treats stand-alone data structure operations (like put and enqueue) as first class citizens, and allows them to execute with virtually no overhead, at the speed of the original data structure library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Baghsorkhi:2016:FAV, author = "Sara S. 
Baghsorkhi and Nalini Vasudevan and Youfeng Wu", title = "{FlexVec}: auto-vectorization for irregular loops", journal = j-SIGPLAN, volume = "51", number = "6", pages = "697--710", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908111", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Traditional vectorization techniques build a dependence graph with distance and direction information to determine whether a loop is vectorizable. Since vectorization reorders the execution of instructions across iterations, in general instructions involved in a strongly connected component (SCC) are deemed not vectorizable unless the SCC can be eliminated using techniques such as scalar expansion or privatization. Therefore, traditional vectorization techniques are limited in their ability to efficiently handle loops with dynamic cross-iteration dependencies or complex control flow interweaved within the dependence cycles. When potential dependencies do not occur very often, the end result is underutilization of the SIMD hardware. In this paper, we propose the FlexVec architecture, which combines new vector instructions with novel code generation techniques to dynamically adjust vector length for loop statements affected by cross-iteration dependencies that happen at runtime. We have designed and implemented FlexVec's new ISA as extensions to the recently released AVX-512 ISA. We have evaluated the performance improvements enabled by FlexVec vectorization for 11 C/C++ SPEC 2006 benchmarks and 7 real applications with AVX-512 vectorization as baseline. We show that the FlexVec vectorization technique produces a Geomean speedup of 9\% for SPEC 2006 and a Geomean speedup of 11\% for 7 real applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Kamil:2016:VLS, author = "Shoaib Kamil and Alvin Cheung and Shachar Itzhaky and Armando Solar-Lezama", title = "Verified lifting of stencil computations", journal = j-SIGPLAN, volume = "51", number = "6", pages = "711--726", month = jun, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2980983.2908117", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Sep 5 07:32:25 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper demonstrates a novel combination of program synthesis and verification to lift stencil computations from low-level Fortran code to a high-level summary expressed using a predicate language. The technique is sound and mostly automated, and leverages counter-example guided inductive synthesis (CEGIS) to find provably correct translations. Lifting existing code to a high-performance description language has a number of benefits, including maintainability and performance portability. For example, our experiments show that the lifted summaries can enable domain specific compilers to do a better job of parallelization as compared to an off-the-shelf compiler working on the original code, and can even support fully automatic migration to hardware accelerators such as GPUs. We have implemented verified lifting in a system called STNG and have evaluated it using microbenchmarks, mini-apps, and real-world applications.
We demonstrate the benefits of verified lifting by first automatically summarizing Fortran source code into a high-level predicate language, and subsequently translating the lifted summaries into Halide, with the translated code achieving median performance speedups of 4.1X and up to 24X for non-trivial stencils as compared to the original implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '16 conference proceedings.", } @Article{Chen:2017:BDA, author = "Yunji Chen", title = "Big Data Analytics and Intelligence at {Alibaba Cloud}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "1--1", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037699", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As China's largest cloud service provider, Alibaba Cloud has been one of the fastest growing cloud computing platforms in the world. In this talk, I'll present an overview of Big Data and AI computing platform at Alibaba Cloud, which consists of a wide range of products and services to enable fast and efficient big data development and intelligent analysis. The underlying computing infrastructure supports a variety of computation scenarios, including batch, interactive, stream, and graph computation, as well as large-scale machine learning on heterogeneous cloud-scale data centers. Several big data products, such as rule-based engine, recommendation system, BI tools, etc., are provided to address different business needs. The platform not only supports Alibaba's internal businesses but also provides solid services to enterprise customers. In addition, I'll describe key techniques and system internals, and outline outstanding research and engineering challenges.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Cherupalli:2017:DAS, author = "Hari Cherupalli and Henry Duwe and Weidong Ye and Rakesh Kumar and John Sartori", title = "Determining Application-specific Peak Power and Energy Requirements for Ultra-low Power Processors", journal = j-SIGPLAN, volume = "52", number = "4", pages = "3--16", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037711", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many emerging applications such as IoT, wearables, implantables, and sensor networks are power- and energy-constrained. These applications rely on ultra-low-power processors that have rapidly become the most abundant type of processor manufactured today. In the ultra-low-power embedded systems used by these applications, peak power and energy requirements are the primary factors that determine critical system characteristics, such as size, weight, cost, and lifetime. While the power and energy requirements of these systems tend to be application-specific, conventional techniques for rating peak power and energy cannot accurately bound the power and energy requirements of an application running on a processor, leading to over-provisioning that increases system size and weight. 
In this paper, we present an automated technique that performs hardware-software co-analysis of the application and ultra-low-power processor in an embedded system to determine application-specific peak power and energy requirements. Our technique provides more accurate, tighter bounds than conventional techniques for determining peak power and energy requirements, reporting 15\% lower peak power and 17\% lower peak energy, on average, than a conventional approach based on profiling and guardbanding. Compared to an aggressive stressmark-based approach, our technique reports power and energy bounds that are 26\% and 26\% lower, respectively, on average. Also, unlike conventional approaches, our technique reports guaranteed bounds on peak power and energy independent of an application's input set. Tighter bounds on peak power and energy can be exploited to reduce system size, weight, and cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Chen:2017:PPQ, author = "Quan Chen and Hailong Yang and Minyi Guo and Ram Srivatsa Kannan and Jason Mars and Lingjia Tang", title = "{Prophet}: Precise {QoS} Prediction on Non-Preemptive Accelerators to Improve Utilization in Warehouse-Scale Computers", journal = j-SIGPLAN, volume = "52", number = "4", pages = "17--32", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037700", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Guaranteeing Quality-of-Service (QoS) of latency-sensitive applications while improving server utilization through application co-location is important yet challenging in modern datacenters. The key challenge is that when applications are co-located on a server, performance interference due to resource contention can be detrimental to the application QoS. Although prior work has proposed techniques to identify ``safe'' co-locations where application QoS is satisfied by predicting the performance interference on multicores, no such prediction technique exists for accelerators such as GPUs. In this work, we present Prophet, an approach to precisely predict the performance degradation of latency-sensitive applications on accelerators due to application co-location. We analyzed the performance interference on accelerators through a real system investigation and found that unlike on multicores where the key contentious resources are shared caches and main memory bandwidth, the key contentious resources on accelerators are instead processing elements, accelerator memory bandwidth and PCIe bandwidth. Based on this observation, we designed interference models that enable the precise prediction for processing element, accelerator memory bandwidth and PCIe bandwidth contention on real hardware. By using a novel technique to forecast solo-run execution traces of the co-located applications using interference models, Prophet can accurately predict the performance degradation of latency-sensitive applications on non-preemptive accelerators. Using Prophet, we can identify ``safe'' co-locations on accelerators to improve utilization without violating the QoS target. Our evaluation shows that Prophet can predict the performance degradation with an average prediction error of 5.47\% on real systems.
Meanwhile, based on the prediction, Prophet achieves accelerator utilization improvements of 49.9\% on average while maintaining the QoS target of latency-sensitive applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Kanev:2017:MAM, author = "Svilen Kanev and Sam Likun Xi and Gu-Yeon Wei and David Brooks", title = "{Mallacc}: Accelerating Memory Allocation", journal = j-SIGPLAN, volume = "52", number = "4", pages = "33--45", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037736", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent work shows that dynamic memory allocation consumes nearly 7\% of all cycles in Google datacenters. With the trend towards increased specialization of hardware, we propose Mallacc, an in-core hardware accelerator designed for broad use across a number of high-performance, modern memory allocators. The design of Mallacc is quite different from traditional throughput-oriented hardware accelerators. Because memory allocation requests tend to be very frequent, fast, and interspersed inside other application code, accelerators must be optimized for latency rather than throughput and area overheads must be kept to a bare minimum. Mallacc accelerates the three primary operations of a typical memory allocation request: size class computation, retrieval of a free memory block, and sampling of memory usage. Our results show that malloc latency can be reduced by up to 50\% with a hardware cost of less than 1500 $\mu$m$^2$ of silicon area, less than 0.006\% of a typical high-performance processor core.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Wen:2017:REV, author = "Shasha Wen and Milind Chabbi and Xu Liu", title = "{REDSPY}: Exploring Value Locality in Software", journal = j-SIGPLAN, volume = "52", number = "4", pages = "47--61", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037729", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Complex code bases with several layers of abstractions have abundant inefficiencies that affect the execution time. Value redundancy is a kind of inefficiency where the same values are repeatedly computed, stored, or retrieved over the course of execution. Not all redundancies can be easily detected or eliminated with compiler optimization passes due to the inherent limitations of the static analysis. Microscopic observation of whole executions at instruction- and operand-level granularity breaks down abstractions and helps recognize redundancies that masquerade in complex programs. We have developed REDSPY---a fine-grained profiler to pinpoint and quantify redundant operations in program executions. Value redundancy may happen over time at same locations or in adjacent locations, and thus it has temporal and spatial locality. REDSPY identifies both temporal and spatial value locality.
Furthermore, REDSPY is capable of identifying values that are approximately the same, enabling optimization opportunities in HPC codes that often use floating point computations. REDSPY provides intuitive optimization guidance by apportioning redundancies to their provenance---source lines and execution calling contexts. REDSPY pinpointed dramatically high volume of redundancies in programs that were optimization targets for decades, such as SPEC CPU2006 suite, Rodinia benchmark, and NWChem---a production computational chemistry code. Guided by REDSPY, we were able to eliminate redundancies that resulted in significant speedups.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Bhattacharjee:2017:TTP, author = "Abhishek Bhattacharjee", title = "Translation-Triggered Prefetching", journal = j-SIGPLAN, volume = "52", number = "4", pages = "63--76", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037705", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose translation-enabled memory prefetching optimizations or TEMPO, a low-overhead hardware mechanism to boost memory performance by exploiting the operating system's (OS) virtual memory subsystem. We are the first to make the following observations: (1) a substantial fraction (20-40\%) of DRAM references in modern big-data workloads are devoted to accessing page tables; and (2) when memory references require page table lookups in DRAM, the vast majority of them (98\%+) also look up DRAM for the subsequent data access. TEMPO exploits these observations to enable DRAM row-buffer and on-chip cache prefetching of the data that page tables point to. TEMPO requires trivial changes to the memory controller (under 3\% additional area), no OS or application changes, and improves performance by 10-30\% and energy by 1-14\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Kim:2017:TAA, author = "Channoh Kim and Jaehyeok Kim and Sungmin Kim and Dooyoung Kim and Namho Kim and Gitae Na and Young H. Oh and Hyeon Gyu Cho and Jae W. Lee", title = "Typed Architectures: Architectural Support for Lightweight Scripting", journal = j-SIGPLAN, volume = "52", number = "4", pages = "77--90", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037726", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic scripting languages are becoming more and more widely adopted not only for fast prototyping but also for developing production-grade applications. They provide high-productivity programming environments featuring high levels of abstraction with powerful built-in functions, automatic memory management, object-oriented programming paradigm and dynamic typing. However, their flexible, dynamic type systems easily become the source of inefficiency in terms of instruction count, memory footprint, and energy consumption. 
This overhead makes it challenging to deploy these high-productivity programming technologies on emerging single-board computers for IoT applications. Addressing this challenge, this paper introduces Typed Architectures, a high-efficiency, low-cost execution substrate for dynamic scripting languages, where each data variable retains high-level type information at an ISA level. Typed Architectures calculate and check the dynamic type of each variable implicitly in hardware, rather than explicitly in software, hence significantly reducing instruction count for dynamic type checking. Besides, Typed Architectures introduce polymorphic instructions (e.g., xadd), which are bound to the correct native instruction at runtime within the pipeline (e.g., add or fadd) to efficiently implement polymorphic operators. Finally, Typed Architectures provide hardware support for flexible yet efficient type tag extraction and insertion, capturing common data layout patterns of tag-value pairs. Our evaluation using a fully synthesizable RISC-V RTL design on FPGA shows that Typed Architectures achieve geomean speedups of 11.2\% and 9.9\% with maximum speedups of 32.6\% and 43.5\% for two production-grade scripting engines for JavaScript and Lua, respectively. Moreover, Typed Architectures improve the energy-delay product (EDP) by 19.3\% for JavaScript and 16.5\% for Lua with an area overhead of 1.6\% at a 40nm technology node.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Seo:2017:FAS, author = "Jihye Seo and Wook-Hee Kim and Woongki Baek and Beomseok Nam and Sam H. Noh", title = "Failure-Atomic Slotted Paging for Persistent Memory", journal = j-SIGPLAN, volume = "52", number = "4", pages = "91--104", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037737", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The slotted-page structure is a database page format commonly used for managing variable-length records. In this work, we develop a novel ``failure-atomic slotted page structure'' for persistent memory that leverages byte addressability and durability of persistent memory to minimize redundant write operations used to maintain consistency in traditional database systems. Failure-atomic slotted paging consists of two key elements: (i) in-place commit per page using hardware transactional memory and (ii) slot header logging that logs the commit mark of each page. The proposed scheme is implemented in SQLite and compared against NVWAL, the current state-of-the-art scheme. Our performance study shows that our failure-atomic slotted paging shows optimal performance for database transactions that insert a single record. For transactions that touch more than one database page, our proposed slot-header logging scheme minimizes the logging overhead by avoiding duplicating pages and logging only the metadata of the dirty pages. 
Overall, we find that our failure-atomic slotted-page management scheme reduces database logging overhead to 1/6 and improves query response time by up to 33\% compared to NVWAL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Nguyen:2017:WSP, author = "Donald Nguyen and Keshav Pingali", title = "What Scalable Programs Need from Transactional Memory", journal = j-SIGPLAN, volume = "52", number = "4", pages = "105--118", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037750", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactional memory (TM) has been the focus of numerous studies, and it is supported in processors such as the IBM Blue Gene/Q and Intel Haswell. Many studies have used the STAMP benchmark suite to evaluate their designs. However, the speedups obtained for the STAMP benchmarks on all TM systems we know of are quite limited; for example, with 64 threads on the IBM Blue Gene/Q, we observe a median speedup of 1.4X using the Blue Gene/Q hardware transactional memory (HTM), and a median speedup of 4.1X using a software transactional memory (STM). What limits the performance of these benchmarks on TMs? In this paper, we argue that the problem lies with the programming model and data structures used to write them. To make this point, we articulate two principles that we believe must be embodied in any scalable program and argue that STAMP programs violate both of them. By modifying the STAMP programs to satisfy both principles, we produce a new set of programs that we call the Stampede suite. Its median speedup on the Blue Gene/Q is 8.0X when using an STM. The two principles also permit us to simplify the TM design. Using this new STM with the Stampede benchmarks, we obtain a median speedup of 17.7X with 64 threads on the Blue Gene/Q and 13.2X with 32 threads on an Intel Westmere system. These results suggest that HTM and STM designs will benefit if more attention is paid to the division of labor between application programs, systems software, and hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Trippel:2017:TMM, author = "Caroline Trippel and Yatin A. Manerkar and Daniel Lustig and Michael Pellauer and Margaret Martonosi", title = "{TriCheck}: Memory Model Verification at the Trisection of Software, Hardware, and {ISA}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "119--133", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037719", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Memory consistency models (MCMs) which govern inter-module interactions in a shared memory system, are a significant, yet often under-appreciated, aspect of system design. MCMs are defined at the various layers of the hardware-software stack, requiring thoroughly verified specifications, compilers, and implementations at the interfaces between layers. 
Current verification techniques evaluate segments of the system stack in isolation, such as proving compiler mappings from a high-level language (HLL) to an ISA or proving validity of a microarchitectural implementation of an ISA. This paper makes a case for full-stack MCM verification and provides a toolflow, TriCheck, capable of verifying that the HLL, compiler, ISA, and implementation collectively uphold MCM requirements. The work showcases TriCheck's ability to evaluate a proposed ISA MCM in order to ensure that each layer and each mapping is correct and complete. Specifically, we apply TriCheck to the open source RISC-V ISA [55], seeking to verify accurate, efficient, and legal compilations from C11. We uncover under-specifications and potential inefficiencies in the current RISC-V ISA documentation and identify possible solutions for each. As an example, we find that a RISC-V-compliant microarchitecture allows 144 outcomes forbidden by C11 to be observed out of 1,701 litmus tests examined. Overall, this paper demonstrates the necessity of full-stack verification for detecting MCM-related bugs in the hardware-software stack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Nalli:2017:APM, author = "Sanketh Nalli and Swapnil Haria and Mark D. Hill and Michael M. Swift and Haris Volos and Kimberly Keeton", title = "An Analysis of Persistent Memory Use with {WHISPER}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "135--148", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037730", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging non-volatile memory (NVM) technologies promise durability with read and write latencies comparable to volatile memory (DRAM). We define Persistent Memory (PM) as NVM accessed with byte addressability at low latency via normal memory instructions. Persistent-memory applications ensure the consistency of persistent data by inserting ordering points between writes to PM allowing the construction of higher-level transaction mechanisms. An epoch is a set of writes to PM between ordering points. To put systems research in PM on a firmer footing, we developed and analyzed a PM benchmark suite called WHISPER (Wisconsin-HP Labs Suite for Persistence) that comprises ten PM applications we gathered to cover all current interfaces to PM. A quantitative analysis reveals several insights: (a) only 4\% of writes in PM-aware applications are to PM and the rest are to volatile memory, (b) software transactions are often implemented with 5 to 50 ordering points (c) 75\% of epochs update exactly one 64B cache line, (d) 80\% of epochs from the same thread depend on previous epochs from the same thread, while few epochs depend on epochs from other threads. Based on our analysis, we propose the Hands-off Persistence System (HOPS) to track updates to PM in hardware. Current hardware design requires applications to force data to PM as each epoch ends. 
HOPS provides high-level ISA primitives for applications to express durability and ordering constraints separately and enforces them automatically, while achieving 24.3\% better performance over current approaches to persistence.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Zhang:2017:PPD, author = "Tong Zhang and Changhee Jung and Dongyoon Lee", title = "{ProRace}: Practical Data Race Detection for Production Use", journal = j-SIGPLAN, volume = "52", number = "4", pages = "149--162", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037708", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents ProRace, a dynamic data race detector practical for production runs. It is lightweight, but still offers high race detection capability. To track memory accesses, ProRace leverages instruction sampling using the performance monitoring unit (PMU) in commodity processors. Our PMU driver enables ProRace to sample more memory accesses at a lower cost compared to the state-of-the-art Linux driver. Moreover, ProRace uses PMU-provided execution contexts including register states and program path, and reconstructs unsampled memory accesses offline. This technique allows ProRace to overcome inherent limitations of sampling and improve the detection coverage by performing data race detection on the trace with not only sampled but also reconstructed memory accesses. Experiments using racy production software including apache and mysql show that, with a reasonable offline cost, ProRace incurs only 2.6\% overhead at runtime with 27.5\% detection probability with a sampling period of 10,000.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Olson:2017:CGM, author = "Lena E. Olson and Mark D. Hill and David A. Wood", title = "Crossing Guard: Mediating Host-Accelerator Coherence Interactions", journal = j-SIGPLAN, volume = "52", number = "4", pages = "163--176", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037715", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Specialized hardware accelerators have performance and energy-efficiency advantages over general-purpose processors. To fully realize these benefits and aid programmability, accelerators may share a physical and virtual address space and full cache coherence with the host system. However, allowing accelerators --- particularly those designed by third parties --- to directly communicate with host coherence protocols poses several problems. Host coherence protocols are complex, vary between companies, and may be proprietary, increasing burden on accelerator designers. Bugs in the accelerator implementation may cause crashes and other serious consequences to the host system. We propose Crossing Guard, a coherence interface between the host coherence system and accelerators.
The Crossing Guard interface provides the accelerator designer with a standardized set of coherence messages that are simple enough to aid in design of bug-free coherent caches. At the same time, they are sufficiently complex to allow customized and optimized accelerator caches with performance comparable to using the host protocol. The Crossing Guard hardware is implemented as part of the trusted host, and provides complete safety to the host coherence system, even in the presence of a pathologically buggy accelerator cache.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{McMahan:2017:ASF, author = "Joseph McMahan and Michael Christensen and Lawton Nichols and Jared Roesch and Sung-Yee Guo and Ben Hardekopf and Timothy Sherwood", title = "An Architecture Supporting Formal and Compositional Binary Analysis", journal = j-SIGPLAN, volume = "52", number = "4", pages = "177--191", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037733", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Building a trustworthy life-critical embedded system requires deep reasoning about the potential effects that sequences of machine instructions can have on full system operation. Rather than trying to analyze complete binaries and the countless ways their instructions can interact with one another --- memory, side effects, control registers, implicit state, etc. --- we explore a new approach. We propose an architecture controlled by a thin computational layer designed to tightly correspond with the lambda calculus, drawing on principles of functional programming to bring the assembly much closer to myriad reasoning frameworks, such as the Coq proof assistant. This approach allows assembly-level verified versions of critical code to operate safely in tandem with arbitrary code, including imperative and unverified system components, without the need for large supporting trusted computing bases. We demonstrate that this computational layer can be built in such a way as to simultaneously provide full programmability and compact, precise, and complete semantics, while still using hardware resources comparable to normal embedded systems. To demonstrate the practicality of this approach, our FPGA-implemented prototype runs an embedded medical application which monitors and treats life-threatening arrhythmias. Though the system integrates untrusted and imperative components, our architecture allows for the formal verification of multiple properties of the end-to-end system, including a proof of correctness of the assembly-level implementation of the core algorithm, the integrity of trusted data via a non-interference proof, and a guarantee that our prototype meets critical timing requirements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Hsiao:2017:ASI, author = "Chun-Hung Hsiao and Satish Narayanasamy and Essam Muhammad Idris Khan and Cristiano L. Pereira and Gilles A. 
Pokam", title = "{AsyncClock}: Scalable Inference of Asynchronous Event Causality", journal = j-SIGPLAN, volume = "52", number = "4", pages = "193--205", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037712", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Asynchronous programming model is commonly used in mobile systems and Web 2.0 environments. Asynchronous race detectors use algorithms that are an order of magnitude performance and space inefficient compared to conventional data race detectors. We solve this problem by identifying and addressing two important problems in reasoning about causality between asynchronous events. Unlike conventional signal-wait operations, establishing causal order between two asynchronous events is fundamentally more challenging as there is no common handle they operate on. We propose a new primitive named AsyncClock that addresses this problem by explicitly tracking causally preceding events, and show that AsyncClock can handle a wide variety of asynchronous causality models. We also address the important scalability problem of efficiently identifying heirless events whose metadata can be reclaimed. We built the first single-pass, non-graph-based Android race detector using our algorithm and applied it to find errors in 20 popular applications. Our tool incurs about 6x performance overhead, which is several times more efficient than the state-of-the-art solution. It also scales well with the execution length. We used our tool to find 147 previously unknown harmful races.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Calciu:2017:BBC, author = "Irina Calciu and Siddhartha Sen and Mahesh Balakrishnan and Marcos K. Aguilera", title = "Black-box Concurrent Data Structures for {NUMA} Architectures", journal = j-SIGPLAN, volume = "52", number = "4", pages = "207--221", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037721", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "High-performance servers are Non-Uniform Memory Access (NUMA) machines. To fully leverage these machines, programmers need efficient concurrent data structures that are aware of the NUMA performance artifacts. We propose Node Replication (NR), a black-box approach to obtaining such data structures. NR takes an arbitrary sequential data structure and automatically transforms it into a NUMA-aware concurrent data structure satisfying linearizability. Using NR requires no expertise in concurrent data structure design, and the result is free of concurrency bugs. NR draws ideas from two disciplines: shared-memory algorithms and distributed systems. Briefly, NR implements a NUMA-aware shared log, and then uses the log to replicate data structures consistently across NUMA nodes. NR is best suited for contended data structures, where it can outperform lock-free algorithms by 3.1x, and lock-based solutions by 30x. To show the benefits of NR to a real application, we apply NR to the data structures of Redis, an in-memory storage system. 
The result outperforms other methods by up to 14x. The cost of NR is additional memory for its log and replicas.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Vora:2017:CCR, author = "Keval Vora and Chen Tian and Rajiv Gupta and Ziang Hu", title = "{CoRAL}: Confined Recovery in Distributed Asynchronous Graph Processing", journal = j-SIGPLAN, volume = "52", number = "4", pages = "223--236", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037747", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Existing distributed asynchronous graph processing systems employ checkpointing to capture globally consistent snapshots and rollback all machines to most recent checkpoint to recover from machine failures. In this paper we argue that recovery in distributed asynchronous graph processing does not require the entire execution state to be rolled back to a globally consistent state due to the relaxed asynchronous execution semantics. We define the properties required in the recovered state for it to be usable for correct asynchronous processing and develop CoRAL, a lightweight checkpointing and recovery algorithm. First, this algorithm carries out confined recovery that only rolls back graph execution states of the failed machines to affect recovery. Second, it relies upon lightweight checkpoints that capture locally consistent snapshots with a reduced peak network bandwidth requirement. Our experiments using real-world graphs show that our technique recovers from failures and finishes processing 1.5x to 3.2x faster compared to the traditional asynchronous checkpointing and recovery mechanism when failures impact 1 to 6 machines of a 16 machine cluster. Moreover, capturing locally consistent snapshots significantly reduces intermittent high peak bandwidth usage required to save the snapshots --- the average reduction in 99th percentile bandwidth ranges from 22\% to 51\% while 1 to 6 snapshot replicas are being maintained.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Vora:2017:KFA, author = "Keval Vora and Rajiv Gupta and Guoqing Xu", title = "{KickStarter}: Fast and Accurate Computations on Streaming Graphs via Trimmed Approximations", journal = j-SIGPLAN, volume = "52", number = "4", pages = "237--251", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037748", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Continuous processing of a streaming graph maintains an approximate result of the iterative computation on a recent version of the graph. Upon a user query, the accurate result on the current graph can be quickly computed by feeding the approximate results to the iterative computation --- a form of incremental computation that corrects the (small amount of) error in the approximate result. 
Despite the effectiveness of this approach in processing growing graphs, it is generally not applicable when edge deletions are present --- existing approximations can lead to either incorrect results (e.g., monotonic computations terminate at an incorrect minima/maxima) or poor performance (e.g., with approximations, convergence takes longer than performing the computation from scratch). This paper presents KickStarter, a runtime technique that can trim the approximate values for a subset of vertices impacted by the deleted edges. The trimmed approximation is both safe and profitable, enabling the computation to produce correct results and converge quickly. KickStarter works for a class of monotonic graph algorithms and can be readily incorporated in any existing streaming graph system. Our experiments with four streaming algorithms on five large graphs demonstrate that trimming not only produces correct results but also accelerates these algorithms by 8.5--23.7x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Powers:2017:BBG, author = "Bobby Powers and John Vilk and Emery D. Berger", title = "{Browsix}: Bridging the Gap Between {Unix} and the Browser", journal = j-SIGPLAN, volume = "52", number = "4", pages = "253--266", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037727", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", abstract = "Applications written to run on conventional operating systems typically depend on OS abstractions like processes, pipes, signals, sockets, and a shared file system. Porting these applications to the web currently requires extensive rewriting or hosting significant portions of code server-side because browsers present a nontraditional runtime environment that lacks OS functionality. This paper presents Browsix, a framework that bridges the considerable gap between conventional operating systems and the browser, enabling unmodified programs expecting a Unix-like environment to run directly in the browser. Browsix comprises two core parts: (1) a JavaScript-only system that makes core Unix features (including pipes, concurrent processes, signals, sockets, and a shared file system) available to web applications; and (2) extended JavaScript runtimes for C, C++, Go, and Node.js that support running programs written in these languages as processes in the browser. Browsix supports running a POSIX shell, making it straightforward to connect applications together via pipes. We illustrate Browsix's capabilities via case studies that demonstrate how it eases porting legacy applications to the browser and enables new functionality. We demonstrate a Browsix-enabled LaTeX editor that operates by executing unmodified versions of pdfLaTeX and BibTeX. This browser-only LaTeX editor can render documents in seconds, making it fast enough to be practical. We further demonstrate how Browsix lets us port a client-server application to run entirely in the browser for disconnected operation. 
Creating these applications required less than 50 lines of glue code and no code modifications, demonstrating how easily Browsix can be used to build sophisticated web applications from existing parts without modification.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Rajbhandari:2017:OCM, author = "Samyam Rajbhandari and Yuxiong He and Olatunji Ruwase and Michael Carbin and Trishul Chilimbi", title = "Optimizing {CNNs} on Multicores for Scalability, Performance and Goodput", journal = j-SIGPLAN, volume = "52", number = "4", pages = "267--280", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037745", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Convolutional Neural Networks (CNN) are a class of Artificial Neural Networks (ANN) that are highly efficient at the pattern recognition tasks that underlie difficult AI problems in a variety of domains, such as speech recognition, object recognition, and natural language processing. CNNs are, however, computationally intensive to train. This paper presents the first characterization of the performance optimization opportunities for training CNNs on CPUs. Our characterization includes insights based on the structure of the network itself (i.e., intrinsic arithmetic intensity of the convolution and its scalability under parallelism) as well as dynamic properties of its execution (i.e., sparsity of the computation). Given this characterization, we present an automatic framework called spg-CNN for optimizing CNN training on CPUs. It comprises of a computation scheduler for efficient parallel execution, and two code generators: one that optimizes for sparsity, and the other that optimizes for spatial reuse in convolutions. We evaluate spg-CNN using convolutions from a variety of real world benchmarks, and show that spg-CNN can train CNNs faster than state-of-the-art approaches by an order of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Sundararajah:2017:LTN, author = "Kirshanthan Sundararajah and Laith Sakka and Milind Kulkarni", title = "Locality Transformations for Nested Recursive Iteration Spaces", journal = j-SIGPLAN, volume = "52", number = "4", pages = "281--295", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037720", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There has been a significant amount of effort invested in designing scheduling transformations such as loop tiling and loop fusion that rearrange the execution of dynamic instances of loop nests to place operations that access the same data close together temporally. In recent years, there has been interest in designing similar transformations that operate on recursive programs, but until now these transformations have only considered simple scenarios: multiple recursions to be fused, or a recursion nested inside a simple loop. 
This paper develops the first set of scheduling transformations for nested recursions: recursive methods that call other recursive methods. These are the recursive analog to nested loops. We present a transformation called recursion twisting that automatically improves locality at all levels of the memory hierarchy, and show that this transformation can yield substantial performance improvements across several benchmarks that exhibit nested recursion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Li:2017:LAC, author = "Ang Li and Shuaiwen Leon Song and Weifeng Liu and Xu Liu and Akash Kumar and Henk Corporaal", title = "Locality-Aware {CTA} Clustering for Modern {GPUs}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "297--311", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037709", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cache is designed to exploit locality; however, the role of on-chip L1 data caches on modern GPUs is often awkward. The locality among global memory requests from different SMs (Streaming Multiprocessors) is predominantly harvested by the commonly-shared L2 with long access latency; while the in-core locality, which is crucial for performance delivery, is handled explicitly by user-controlled scratchpad memory. In this work, we disclose another type of data locality that has been long ignored but with performance boosting potential --- the inter-CTA locality. Exploiting such locality is rather challenging due to unclear hardware feasibility, unknown and inaccessible underlying CTA scheduler, and small in-core cache capacity. To address these issues, we first conduct a thorough empirical exploration on various modern GPUs and demonstrate that inter-CTA locality can be harvested, both spatially and temporally, on L1 or L1/Tex unified cache. Through further quantification process, we prove the significance and commonality of such locality among GPU applications, and discuss whether such reuse is exploitable. By leveraging these insights, we propose the concept of CTA-Clustering and its associated software-based techniques to reshape the default CTA scheduling in order to group the CTAs with potential reuse together on the same SM. Our techniques require no hardware modification and can be directly deployed on existing GPUs. In addition, we incorporate these techniques into an integrated framework for automatic inter-CTA locality optimization. We evaluate our techniques using a wide range of popular GPU applications on all modern generations of NVIDIA GPU architectures. The results show that our proposed techniques significantly improve cache performance through reducing L2 cache transactions by 55\%, 65\%, 29\%, 28\% on average for Fermi, Kepler, Maxwell and Pascal, respectively, leading to an average of 1.46x, 1.48x, 1.45x, 1.41x (up to 3.8x, 3.6x, 3.1x, 3.3x) performance speedups for applications with algorithm-related inter-CTA reuse.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Churchill:2017:SLS, author = "Berkeley Churchill and Rahul Sharma and J. F. 
Bastien and Alex Aiken", title = "Sound Loop Superoptimization for {Google Native Client}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "313--326", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037754", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software fault isolation (SFI) is an important technique for the construction of secure operating systems, web browsers, and other extensible software. We demonstrate that superoptimization can dramatically improve the performance of Google Native Client, a SFI system that ships inside the Google Chrome Browser. Key to our results are new techniques for superoptimization of loops: we propose a new architecture for superoptimization tools that incorporates both a fully sound verification technique to ensure correctness and a bounded verification technique to guide the search to optimized code. In our evaluation we optimize 13 libc string functions, formally verify the correctness of the optimizations and report a median and average speedup of 25\% over the libraries shipped by Google.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Bianchini:2017:IDE, author = "Ricardo Bianchini", title = "Improving Datacenter Efficiency", journal = j-SIGPLAN, volume = "52", number = "4", pages = "327--327", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3046426", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Internet companies can improve datacenter efficiency and reduce costs, by minimizing resource waste while avoiding (or limiting) performance degradation. In this talk, I will first overview a few of the efficiency-related efforts we are undertaking at Microsoft, including leveraging workload history to improve resource management. I will then discuss some lessons from deploying these efforts in production and how they relate to academic research.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Liu:2017:DBD, author = "Mengxing Liu and Mingxing Zhang and Kang Chen and Xuehai Qian and Yongwei Wu and Weimin Zheng and Jinglei Ren", title = "{DudeTM}: Building Durable Transactions with Decoupling for Persistent Memory", journal = j-SIGPLAN, volume = "52", number = "4", pages = "329--343", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037714", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging non-volatile memory (NVM) offers non-volatility, byte-addressability and fast access at the same time. To make the best use of these properties, it has been shown by empirical evidence that programs should access NVM directly through CPU load and store instructions, so that the overhead of a traditional file system or database can be avoided. 
Thus, durable transactions become a common choice of applications for accessing persistent memory data in a crash consistent manner. However, existing durable transaction systems employ either undo logging, which requires a fence for every memory write, or redo logging, which requires intercepting all memory reads within transactions. This paper presents DUDETM, a crash-consistent durable transaction system that avoids the drawbacks of both undo logging and redo logging. DUDETM uses shadow DRAM to decouple the execution of a durable transaction into three fully asynchronous steps. The advantage is that only minimal fences and no memory read instrumentation are required. This design also enables an out-of-the-box transactional memory (TM) to be used as an independent component in our system. The evaluation results show that DUDETM adds durability to a TM system with only 7.4--24.6\% throughput degradation. Compared to the existing durable transaction systems, DUDETM provides 1.7x to 4.4x higher throughput. Moreover, DUDETM can be implemented with existing hardware TMs with minor hardware modifications, leading to a further 1.7x speedup.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Klimovic:2017:RRF, author = "Ana Klimovic and Heiner Litz and Christos Kozyrakis", title = "{ReFlex}: Remote Flash $ \approx $ Local Flash", journal = j-SIGPLAN, volume = "52", number = "4", pages = "345--359", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037732", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Remote access to NVMe Flash enables flexible scaling and high utilization of Flash capacity and IOPS within a datacenter. However, existing systems for remote Flash access either introduce significant performance overheads or fail to isolate the multiple remote clients sharing each Flash device. We present ReFlex, a software-based system for remote Flash access, that provides nearly identical performance to accessing local Flash. ReFlex uses a dataplane kernel to closely integrate networking and storage processing to achieve low latency and high throughput at low resource requirements. Specifically, ReFlex can serve up to 850K IOPS per core over TCP/IP networking, while adding 21 $\mu$s over direct access to local Flash. ReFlex uses a QoS scheduler that can enforce tail latency and throughput service-level objectives (SLOs) for thousands of remote clients. We show that ReFlex allows applications to use remote Flash while maintaining their original performance with local Flash.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Jevdjic:2017:ASC, author = "Djordje Jevdjic and Karin Strauss and Luis Ceze and Henrique S.
Malvar", title = "Approximate Storage of Compressed and Encrypted Videos", journal = j-SIGPLAN, volume = "52", number = "4", pages = "361--373", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037718", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The popularization of video capture devices has created strong storage demand for encoded videos. Approximate storage can ease this demand by enabling denser storage at the expense of occasional errors. Unfortunately, even minor storage errors, such as bit flips, can result in major visual damage in encoded videos. Similarly, video encryption, widely employed for privacy and digital rights management, may create long dependencies between bits that show little or no tolerance to storage errors. In this paper we propose VideoApp, a novel and efficient methodology to compute bit-level reliability requirements for encoded videos by tracking visual and metadata dependencies within encoded bitstreams. We further show how VideoApp can be used to trade video quality for storage density in an optimal way. We integrate our methodology into a popular H.264 encoder to partition an encoded video stream into multiple streams that can receive different levels of error correction according to their reliability needs. When applied to a dense and highly error-prone multi-level cell storage substrate, our variable error correction mechanism reduces the error correction overhead by half under the most error-intolerant encoder settings, achieving quality/density points that neither compression nor approximation can achieve alone. Finally, we define the basic invariants needed to support encrypted approximate video storage. We present an analysis of block cipher modes of operation, showing that some are fully compatible with approximation, enabling approximate and secure video storage systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Elyasi:2017:EIR, author = "Nima Elyasi and Mohammad Arjomand and Anand Sivasubramaniam and Mahmut T. Kandemir and Chita R. Das and Myoungsoo Jung", title = "Exploiting Intra-Request Slack to Improve {SSD} Performance", journal = j-SIGPLAN, volume = "52", number = "4", pages = "375--388", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037728", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With Solid State Disks (SSDs) offering high degrees of parallelism, SSD controllers place data and direct requests to exploit the maximum offered hardware parallelism. In the quest to maximize parallelism and utilization, sub-requests of a request that are directed to different flash chips by the scheduler can experience differential wait times since their individual queues are not coordinated and load balanced at all times. Since the macro request is considered complete only when its last sub-request completes, some of its sub-requests that complete earlier have to necessarily wait for this last sub-request. 
This paper opens the door to a new class of schedulers to leverage such slack between sub-requests in order to improve response times. Specifically, the paper presents the design and implementation of a slack-enabled re-ordering scheduler, called Slacker, for sub-requests issued to each flash chip. Layered under a modern SSD request scheduler, Slacker estimates the slack of each incoming sub-request to a flash chip and allows them to jump ahead of existing sub-requests with sufficient slack so as to not detrimentally impact their response times. Slacker is simple to implement and imposes only marginal additions to the hardware. Using a spectrum of 21 workloads with diverse read-write characteristics, we show that Slacker provides as much as 19.5\%, 13\% and 14.5\% improvement in response times, with average improvements of 12\%, 6.5\% and 8.5\%, for write-intensive, read-intensive and read-write balanced workloads, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Wang:2017:GSM, author = "Kai Wang and Aftab Hussain and Zhiqiang Zuo and Guoqing Xu and Ardalan Amiri Sani", title = "{Graspan}: a Single-machine Disk-based Graph System for Interprocedural Static Analyses of Large-scale Systems Code", journal = j-SIGPLAN, volume = "52", number = "4", pages = "389--404", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037744", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "There is more than a decade-long history of using static analysis to find bugs in systems such as Linux. Most of the existing static analyses developed for these systems are simple checkers that find bugs based on pattern matching. Despite the presence of many sophisticated interprocedural analyses, few of them have been employed to improve checkers for systems code due to their complex implementations and poor scalability. In this paper, we revisit the scalability problem of interprocedural static analysis from a ``Big Data'' perspective. That is, we turn sophisticated code analysis into Big Data analytics and leverage novel data processing techniques to solve this traditional programming language problem. We develop Graspan, a disk-based parallel graph system that uses an edge-pair centric computation model to compute dynamic transitive closures on very large program graphs. We implement context-sensitive pointer/alias and dataflow analyses on Graspan. An evaluation of these analyses on large codebases such as Linux shows that their Graspan implementations scale to millions of lines of code and are much simpler than their original implementations. 
Moreover, we show that these analyses can be used to augment the existing checkers; these augmented checkers uncovered 132 new NULL pointer bugs and 1308 unnecessary NULL tests in Linux 4.4.0-rc5, PostgreSQL 8.3.9, and Apache httpd 2.2.18.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Ren:2017:SDH, author = "Ao Ren and Zhe Li and Caiwen Ding and Qinru Qiu and Yanzhi Wang and Ji Li and Xuehai Qian and Bo Yuan", title = "{SC-DCNN}: Highly-Scalable Deep Convolutional Neural Network using Stochastic Computing", journal = j-SIGPLAN, volume = "52", number = "4", pages = "405--418", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037746", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the recent advance of wearable devices and Internet of Things (IoTs), it becomes attractive to implement the Deep Convolutional Neural Networks (DCNNs) in embedded and portable systems. Currently, executing the software-based DCNNs requires high-performance servers, restricting the widespread deployment on embedded and mobile IoT devices. To overcome this obstacle, considerable research efforts have been made to develop highly-parallel and specialized DCNN accelerators using GPGPUs, FPGAs or ASICs. Stochastic Computing (SC), which uses a bit-stream to represent a number within [-1, 1] by counting the number of ones in the bit-stream, has high potential for implementing DCNNs with high scalability and ultra-low hardware footprint. Since multiplications and additions can be calculated using AND gates and multiplexers in SC, significant reductions in power (energy) and hardware footprint can be achieved compared to the conventional binary arithmetic implementations. The tremendous savings in power (energy) and hardware resources allow immense design space for enhancing scalability and robustness for hardware DCNNs. This paper presents SC-DCNN, the first comprehensive design and optimization framework of SC-based DCNNs, using a bottom-up approach. We first present the designs of function blocks that perform the basic operations in DCNN, including inner product, pooling, and activation function. Then we propose four designs of feature extraction blocks, which are in charge of extracting features from input feature maps, by connecting different basic function blocks with joint optimization. Moreover, the efficient weight storage methods are proposed to reduce the area and power (energy) consumption. Putting all together, with feature extraction blocks carefully selected, SC-DCNN is holistically optimized to minimize area and power (energy) consumption while maintaining high network accuracy. 
Experimental results demonstrate that the LeNet5 implemented in SC-DCNN consumes only 17 mm$^2$ area and 1.53 W power, achieves throughput of 781250 images/s, area efficiency of 45946 images/s/ mm$^2$, and energy efficiency of 510734 images/J.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Ajay:2017:GIL, author = "Jerry Ajay and Chen Song and Aditya Singh Rathore and Chi Zhou and Wenyao Xu", title = "{$3$DGates}: an Instruction-Level Energy Analysis and Optimization of {$3$D} Printers", journal = j-SIGPLAN, volume = "52", number = "4", pages = "419--433", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037752", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As the next-generation manufacturing driven force, 3D printing technology is having a transformative effect on various industrial domains and has been widely applied in a broad spectrum of applications. It also progresses towards other versatile fields with portable battery-powered 3D printers working on a limited energy budget. While reducing manufacturing energy is an essential challenge in industrial sustainability and national economics, this growing trend motivates us to explore the energy consumption of the 3D printer for the purpose of energy efficiency. To this end, we perform an in-depth analysis of energy consumption in commercial, off-the-shelf 3D printers from an instruction-level perspective. We build an instruction-level energy model and an energy profiler to analyze the energy cost during the fabrication process. From the insights obtained by the energy profiler, we propose and implement a cross-layer energy optimization solution, called 3DGates, which spans the instruction-set, the compiler and the firmware. We evaluate 3DGates over 338 benchmarks on a 3D printer and achieve an overall energy reduction of 25\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Cox:2017:EAT, author = "Guilherme Cox and Abhishek Bhattacharjee", title = "Efficient Address Translation for Architectures with Multiple Page Sizes", journal = j-SIGPLAN, volume = "52", number = "4", pages = "435--448", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037704", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Processors and operating systems (OSes) support multiple memory page sizes. Superpages increase Translation Lookaside Buffer (TLB) hits, while small pages provide fine-grained memory protection. Ideally, TLBs should perform well for any distribution of page sizes. In reality, set-associative TLBs --- used frequently for their energy efficiency compared to fully-associative TLBs --- cannot (easily) support multiple page sizes concurrently. Instead, commercial systems typically implement separate set-associative TLBs for different page sizes. 
This means that when superpages are allocated aggressively, TLB misses may, counterintuitively, increase even if entries for small pages remain unused (and vice-versa). We invent MIX TLBs, energy-frugal set-associative structures that concurrently support all page sizes by exploiting superpage allocation patterns. MIX TLBs boost the performance (often by 10--30\%) of big-memory applications on native CPUs, virtualized CPUs, and GPUs. MIX TLBs are simple and require no OS or program changes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Lesokhin:2017:PFS, author = "Ilya Lesokhin and Haggai Eran and Shachar Raindel and Guy Shapiro and Sagi Grimberg and Liran Liss and Muli Ben-Yehuda and Nadav Amit and Dan Tsafrir", title = "Page Fault Support for Network Controllers", journal = j-SIGPLAN, volume = "52", number = "4", pages = "449--466", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037710", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Direct network I/O allows network controllers (NICs) to expose multiple instances of themselves, to be used by untrusted software without a trusted intermediary. Direct I/O thus frees researchers from legacy software, fueling studies that innovate in multitenant setups. Such studies, however, overwhelmingly ignore one serious problem: direct memory accesses (DMAs) of NICs disallow page faults, forcing systems to either pin entire address spaces to physical memory and thereby hinder memory utilization, or resort to APIs that pin/unpin memory buffers before/after they are DMAed, which complicates the programming model and hampers performance. We solve this problem by designing and implementing page fault support for InfiniBand and Ethernet NICs. A main challenge we tackle---unique to NICs---is handling receive DMAs that trigger page faults, leaving the NIC without memory to store the incoming data. We demonstrate that our solution provides all the benefits associated with ``regular'' virtual memory, notably (1) a simpler programming model that rids users from the need to pin, and (2) the ability to employ all the canonical memory optimizations, such as memory overcommitment and demand-paging based on actual use. We show that, as a result, benchmark performance improves by up to 1.9x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Hu:2017:TFC, author = "Yang Hu and Mingcong Song and Tao Li", title = "Towards {``Full Containerization''} in Containerized Network Function Virtualization", journal = j-SIGPLAN, volume = "52", number = "4", pages = "467--481", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037713", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With exploding traffic stuffing existing network infrastructure, today's telecommunication and cloud service providers resort to Network Function Virtualization (NFV) for greater agility and economics.
Pioneer service provider such as AT{\&}T proposes to adopt container in NFV to achieve shorter Virtualized Network Function (VNF) provisioning time and better runtime performance. However, we characterize typical NFV workloads on the containers and find that the performance is unsatisfactory. We observe that the shared host OS network stack is the main bottleneck, where the traffic flow processing involves a large amount of intermediate memory buffers and results in significant last level cache pollution. Existing OS memory allocation policies fail to exploit the locality and data sharing information among buffers. In this paper, we propose NetContainer, a software framework that achieves fine-grained hardware resource management for containerized NFV platform. NetContainer employs a cache access overheads guided page coloring scheme to coordinately address the inter-flow cache access overheads and intra-flow cache access overheads. It maps the memory buffer pages that manifest low cache access overheads (across a flow or among the flows) to the same last level cache partition. NetContainer exploits a footprint theory based method to estimate the cache access overheads and a Min-Cost Max-Flow model to guide the memory buffer mappings. We implement the NetContainer in Linux kernel and extensively evaluate it with real NFV workloads. Experimental results show that NetContainer outperforms conventional page coloring-based memory allocator by 48\% in terms of successful call rate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Wu:2017:FEF, author = "Bo Wu and Xu Liu and Xiaobo Zhou and Changjun Jiang", title = "{FLEP}: Enabling Flexible and Efficient Preemption on {GPUs}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "483--496", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037742", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "GPUs are widely adopted in HPC and cloud computing platforms to accelerate general-purpose workloads. However, modern GPUs do not support flexible preemption, leading to performance and priority inversion problems in multi-tasking environments. In this paper, we propose and develop FLEP, the first software system that enables flexible kernel preemption and kernel scheduling on commodity GPUs. The FLEP compilation engine transforms the GPU program into preemptable forms, which can be interrupted during execution and yield all or part of the streaming multi-processors (SMs) in the GPU. The FLEP runtime engine intercepts all kernel invocations and determines which kernels and how those kernels should be preempted and scheduled. Experimental results on two-kernel co-runs demonstrate up to 24.2X speedup for high-priority kernels and up to 27X improvement on normalized average turnaround time for kernels with the same priority. FLEP reduces the preemption latency by up to 41\% compared to yielding the whole GPU when the waiting kernels only need several SMs.
With all the benefits, FLEP only introduces 2.5\% runtime overhead, which is substantially lower than the kernel slicing approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Li:2017:SSA, author = "Kaiwei Li and Jianfei Chen and Wenguang Chen and Jun Zhu", title = "{SaberLDA}: Sparsity-Aware Learning of Topic Models on {GPUs}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "497--509", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037740", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Latent Dirichlet Allocation (LDA) is a popular tool for analyzing discrete count data such as text and images. Applications require LDA to handle both large datasets and a large number of topics. Though distributed CPU systems have been used, GPU-based systems have emerged as a promising alternative because of the high computational power and memory bandwidth of GPUs. However, existing GPU-based LDA systems cannot support a large number of topics because they use algorithms on dense data structures whose time and space complexity is linear to the number of topics. In this paper, we propose SaberLDA, a GPU-based LDA system that implements a sparsity-aware algorithm to achieve sublinear time complexity and scales well to learn a large number of topics. To address the challenges introduced by sparsity, we propose a novel data layout, a new warp-based sampling kernel, and an efficient sparse count matrix updating algorithm that improves locality, makes efficient utilization of GPU warps, and reduces memory consumption. Experiments show that SaberLDA can learn from billions-token-scale data with up to 10,000 topics, which is almost two orders of magnitude larger than that of the previous GPU-based systems. With a single GPU card, SaberLDA is able to learn 10,000 topics from a dataset of billions of tokens in a few hours, which is only achievable with clusters with tens of machines before.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Khazraee:2017:MNO, author = "Moein Khazraee and Lu Zhang and Luis Vega and Michael Bedford Taylor", title = "{Moonwalk}: {NRE} Optimization in {ASIC} Clouds", journal = j-SIGPLAN, volume = "52", number = "4", pages = "511--526", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037749", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cloud services are becoming increasingly globalized and data-center workloads are expanding exponentially. GPU and FPGA-based clouds have illustrated improvements in power and performance by accelerating compute-intensive workloads. ASIC-based clouds are a promising way to optimize the Total Cost of Ownership (TCO) of a given datacenter computation (e.g. YouTube transcoding) by reducing both energy consumption and marginal computation cost. 
The feasibility of an ASIC Cloud for a particular application is directly gated by the ability to manage the Non-Recurring Engineering (NRE) costs of designing and fabricating the ASIC, so that it is significantly lower (e.g. 2X) than the TCO of the best available alternative. In this paper, we show that technology node selection is a major tool for managing ASIC Cloud NRE, and allows the designer to trade off an accelerator's excess energy efficiency and cost performance for lower total cost. We explore NRE and cross-technology optimization of ASIC Clouds for four different applications: Bitcoin mining, YouTube-style video transcoding, Litecoin, and Deep Learning. We address these challenges and show large reductions in the NRE, potentially enabling ASIC Clouds to address a wider variety of datacenter workloads. Our results suggest that advanced nodes like 16nm will lead to sub-optimal TCO for many workloads, and that use of older nodes like 65nm can enable a greater diversity of ASIC Clouds.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Park:2017:DRM, author = "Jason Jong Kyu Park and Yongjun Park and Scott Mahlke", title = "Dynamic Resource Management for Efficient Utilization of Multitasking {GPUs}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "527--540", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037707", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As graphics processing units (GPUs) are broadly adopted, running multiple applications on a GPU at the same time is beginning to attract wide attention. Recent proposals on multitasking GPUs have focused on either spatial multitasking, which partitions GPU resource at a streaming multiprocessor (SM) granularity, or simultaneous multikernel (SMK), which runs multiple kernels on the same SM. However, multitasking performance varies heavily depending on the resource partitions within each scheme, and the application mixes. In this paper, we propose GPU Maestro that performs dynamic resource management for efficient utilization of multitasking GPUs. GPU Maestro can discover the best performing GPU resource partition exploiting both spatial multitasking and SMK. Furthermore, dynamism within a kernel and interference between the kernels are automatically considered because GPU Maestro finds the best performing partition through direct measurements. 
Evaluations show that GPU Maestro can improve average system throughput by 20.2\% and 13.9\% over the baseline spatial multitasking and SMK, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Zhang:2017:ISC, author = "Rui Zhang and Natalie Stanley and Christopher Griggs and Andrew Chi and Cynthia Sturton", title = "Identifying Security Critical Properties for the Dynamic Verification of a Processor", journal = j-SIGPLAN, volume = "52", number = "4", pages = "541--554", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037734", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a methodology for identifying security critical properties for use in the dynamic verification of a processor. Such verification has been shown to be an effective way to prevent exploits of vulnerabilities in the processor, given a meaningful set of security properties. We use known processor errata to establish an initial set of security-critical invariants of the processor. We then use machine learning to infer an additional set of invariants that are not tied to any particular, known vulnerability, yet are critical to security. We build a tool chain implementing the approach and evaluate it for the open-source OR1200 RISC processor. We find that our tool can identify 19 (86.4\%) of the 22 manually crafted security-critical properties from prior work and generates 3 new security properties not covered in prior work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Ferraiuolo:2017:VPH, author = "Andrew Ferraiuolo and Rui Xu and Danfeng Zhang and Andrew C. Myers and G. Edward Suh", title = "Verification of a Practical Hardware Security Architecture Through Static Information Flow Analysis", journal = j-SIGPLAN, volume = "52", number = "4", pages = "555--568", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037739", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hardware-based mechanisms for software isolation are becoming increasingly popular, but implementing these mechanisms correctly has proved difficult, undermining the root of security. This work introduces an effective way to formally verify important properties of such hardware security mechanisms. In our approach, hardware is developed using a lightweight security-typed hardware description language (HDL) that performs static information flow analysis. We show the practicality of our approach by implementing and verifying a simplified but realistic multi-core prototype of the ARM TrustZone architecture. To make the security-typed HDL expressive enough to verify a realistic processor, we develop new type system features. Our experiments suggest that information flow analysis is efficient, and programmer effort is modest. 
We also show that information flow constraints are an effective way to detect hardware vulnerabilities, including several found in commercial processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Chisnall:2017:CJS, author = "David Chisnall and Brooks Davis and Khilan Gudka and David Brazdil and Alexandre Joannou and Jonathan Woodruff and A. Theodore Markettos and J. Edward Maste and Robert Norton and Stacey Son and Michael Roe and Simon W. Moore and Peter G. Neumann and Ben Laurie and Robert N. M. Watson", title = "{CHERI JNI}: Sinking the {Java} Security Model into the {C}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "569--583", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037725", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Java provides security and robustness by building a high-level security model atop the foundation of memory protection. Unfortunately, any native code linked into a Java program --- including the million lines used to implement the standard library --- is able to bypass both the memory protection and the higher-level policies. We present a hardware-assisted implementation of the Java native code interface, which extends the guarantees required for Java's security model to native code. Our design supports safe direct access to buffers owned by the JVM, including hardware-enforced read-only access where appropriate. We also present Java language syntax to declaratively describe isolated compartments for native code. We show that it is possible to preserve the memory safety and isolation requirements of the Java security model in C code, allowing native code to run in the same process as Java code with the same impact on security as running equivalent Java code. Our approach has a negligible impact on performance, compared with the existing unsafe native code interface. We demonstrate a prototype implementation running on the CHERI microprocessor synthesized in FPGA.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Ge:2017:GGC, author = "Xinyang Ge and Weidong Cui and Trent Jaeger", title = "{GRIFFIN}: Guarding Control Flows Using {Intel} Processor Trace", journal = j-SIGPLAN, volume = "52", number = "4", pages = "585--598", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037716", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Researchers are actively exploring techniques to enforce control-flow integrity (CFI), which restricts program execution to a predefined set of targets for each indirect control transfer to prevent code-reuse attacks. While hardware-assisted CFI enforcement may have the potential for advantages in performance and flexibility over software instrumentation, current hardware-assisted defenses are either incomplete (i.e., do not enforce all control transfers) or less efficient in comparison. 
We find that the recent introduction of hardware features to log complete control-flow traces, such as Intel Processor Trace (PT), provides an opportunity to explore how efficient and flexible a hardware-assisted CFI enforcement system may become. While Intel PT was designed to aid in offline debugging and failure diagnosis, we explore its effectiveness for online CFI enforcement over unmodified binaries by designing a parallelized method for enforcing various types of CFI policies. We have implemented a prototype called GRIFFIN in the Linux 4.2 kernel that enables complete CFI enforcement over a variety of software, including the Firefox browser and its jitted code. Our experiments show that GRIFFIN can enforce fine-grained CFI policies with shadow stack as recommended by researchers at a performance that is comparable to software-only instrumentation techniques. In addition, we find that alternative logging approaches yield significant performance improvements for trace processing, identifying opportunities for further hardware assistance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Delimitrou:2017:BKW, author = "Christina Delimitrou and Christos Kozyrakis", title = "{Bolt}: {I} Know What You Did Last Summer\ldots{} In The Cloud", journal = j-SIGPLAN, volume = "52", number = "4", pages = "599--613", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037703", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cloud providers routinely schedule multiple applications per physical host to increase efficiency. The resulting interference on shared resources often leads to performance degradation and, more importantly, security vulnerabilities. Interference can leak important information ranging from a service's placement to confidential data, like private keys. We present Bolt, a practical system that accurately detects the type and characteristics of applications sharing a cloud platform based on the interference an adversary sees on shared resources. Bolt leverages online data mining techniques that only require 2-5 seconds for detection. In a multi-user study on EC2, Bolt correctly identifies the characteristics of 385 out of 436 diverse workloads. Extracting this information enables a wide spectrum of previously-impractical cloud attacks, including denial of service attacks (DoS) that increase tail latency by 140x, as well as resource freeing (RFA) and co-residency attacks. Finally, we show that while advanced isolation mechanisms, such as cache partitioning lower detection accuracy, they are insufficient to eliminate these vulnerabilities altogether. 
To do so, one must either disallow core sharing, or only allow it between threads of the same application, leading to significant inefficiencies and performance penalties.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Kang:2017:NCI, author = "Yiping Kang and Johann Hauswald and Cao Gao and Austin Rovinski and Trevor Mudge and Jason Mars and Lingjia Tang", title = "Neurosurgeon: Collaborative Intelligence Between the Cloud and Mobile Edge", journal = j-SIGPLAN, volume = "52", number = "4", pages = "615--629", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037698", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The computation for today's intelligent personal assistants such as Apple Siri, Google Now, and Microsoft Cortana, is performed in the cloud. This cloud-only approach requires significant amounts of data to be sent to the cloud over the wireless network and puts significant computational pressure on the datacenter. However, as the computational resources in mobile devices become more powerful and energy efficient, questions arise as to whether this cloud-only processing is desirable moving forward, and what are the implications of pushing some or all of this compute to the mobile devices on the edge. In this paper, we examine the status quo approach of cloud-only processing and investigate computation partitioning strategies that effectively leverage both the cycles in the cloud and on the mobile device to achieve low latency, low energy consumption, and high datacenter throughput for this class of intelligent applications. Our study uses 8 intelligent applications spanning computer vision, speech, and natural language domains, all employing state-of-the-art Deep Neural Networks (DNNs) as the core machine learning technique. We find that given the characteristics of DNN algorithms, a fine-grained, layer-level computation partitioning strategy based on the data and computation variations of each layer within a DNN has significant latency and energy advantages over the status quo approach. Using this insight, we design Neurosurgeon, a lightweight scheduler to automatically partition DNN computation between mobile devices and datacenters at the granularity of neural network layers. Neurosurgeon does not require per-application profiling. It adapts to various DNN architectures, hardware platforms, wireless networks, and server load levels, intelligently partitioning computation for best latency or best mobile energy. We evaluate Neurosurgeon on a state-of-the-art mobile development platform and show that it improves end-to-end latency by 3.1X on average and up to 40.7X, reduces mobile energy consumption by 59.5\% on average and up to 94.7\%, and improves datacenter throughput by 1.5X on average and up to 6.7X.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Agarwal:2017:TAT, author = "Neha Agarwal and Thomas F. 
Wenisch", title = "{Thermostat}: Application-transparent Page Management for Two-tiered Main Memory", journal = j-SIGPLAN, volume = "52", number = "4", pages = "631--644", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037706", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The advent of new memory technologies that are denser and cheaper than commodity DRAM has renewed interest in two-tiered main memory schemes. Infrequently accessed application data can be stored in such memories to achieve significant memory cost savings. Past research on two-tiered main memory has assumed a 4KB page size. However, 2MB huge pages are performance critical in cloud applications with large memory footprints, especially in virtualized cloud environments, where nested paging drastically increases the cost of 4KB page management. We present Thermostat, an application-transparent huge-page-aware mechanism to place pages in a dual-technology hybrid memory system while achieving both the cost advantages of two-tiered memory and performance advantages of transparent huge pages. We present an online page classification mechanism that accurately classifies both 4KB and 2MB pages as hot or cold while incurring no observable performance overhead across several representative cloud applications. We implement Thermostat in Linux kernel version 4.5 and evaluate its effectiveness on representative cloud computing workloads running under KVM virtualization. We emulate slow memory with performance characteristics approximating near-future high-density memory technology and show that Thermostat migrates up to 50\% of application footprint to slow memory while limiting performance degradation to 3\%, thereby reducing memory cost up to 30\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Barbalace:2017:BBH, author = "Antonio Barbalace and Robert Lyerly and Christopher Jelesnianski and Anthony Carno and Ho-Ren Chuang and Vincent Legout and Binoy Ravindran", title = "Breaking the Boundaries in Heterogeneous-{ISA} Datacenters", journal = j-SIGPLAN, volume = "52", number = "4", pages = "645--659", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037738", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy efficiency is one of the most important design considerations in running modern datacenters. Datacenter operating systems rely on software techniques such as execution migration to achieve energy efficiency across pools of machines. Execution migration is possible in datacenters today because they consist mainly of homogeneous-ISA machines. However, recent market trends indicate that alternate ISAs such as ARM and PowerPC are pushing into the datacenter, meaning current execution migration techniques are no longer applicable. How can execution migration be applied in future heterogeneous-ISA datacenters? In this work we present a compiler, runtime, and an operating system extension for enabling execution migration between heterogeneous-ISA servers. 
We present a new multi-ISA binary architecture and heterogeneous-OS containers for facilitating efficient migration of natively-compiled applications. We build and evaluate a prototype of our design and demonstrate energy savings of up to 66\% for a workload running on an ARM and an x86 server interconnected by a high-speed network.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Lustig:2017:ASC, author = "Daniel Lustig and Andrew Wright and Alexandros Papakonstantinou and Olivier Giroux", title = "Automated Synthesis of Comprehensive Memory Model Litmus Test Suites", journal = j-SIGPLAN, volume = "52", number = "4", pages = "661--675", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037723", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The memory consistency model is a fundamental part of any shared memory architecture or programming model. Modern weak memory models are notoriously difficult to define and to implement correctly. Most real-world programming languages, compilers, and (micro)architectures therefore rely heavily on black-box testing methodologies. The success of such techniques requires that the suite of litmus tests used to perform the testing be comprehensive--it should ideally stress all obscure corner cases of the model and of its implementation. Most litmus test suites today are generated from some combination of manual effort and randomization; however, the complex and subtle nature of contemporary memory models means that manual effort is both error-prone and subject to incomplete coverage. This paper presents a methodology for synthesizing comprehensive litmus test suites directly from a memory model specification. By construction, these suites contain all tests satisfying a minimality criterion: that no synchronization mechanism in the test can be weakened without causing new behaviors to become observable. We formalize this notion using the Alloy modeling language, and we apply it to a number of existing and newly-proposed memory models. Our results show not only that this synthesis technique can automatically reproduce all manually-generated tests from existing suites, but also that it discovers new tests that are not as well studied.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Liu:2017:DAD, author = "Haopeng Liu and Guangpu Li and Jeffrey F. Lukman and Jiaxin Li and Shan Lu and Haryadi S. Gunawi and Chen Tian", title = "{DCatch}: Automatically Detecting Distributed Concurrency Bugs in Cloud Systems", journal = j-SIGPLAN, volume = "52", number = "4", pages = "677--691", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037735", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In big data and cloud computing era, reliability of distributed systems is extremely important. Unfortunately, distributed concurrency bugs, referred to as DCbugs, widely exist. 
They hide in the large state space of distributed cloud systems and manifest non-deterministically depending on the timing of distributed computation and communication. Effective techniques to detect DCbugs are desired. This paper presents a pilot solution, DCatch, in the world of DCbug detection. DCatch predicts DCbugs by analyzing correct execution of distributed systems. To build DCatch, we design a set of happens-before rules that model a wide variety of communication and concurrency mechanisms in real-world distributed cloud systems. We then build runtime tracing and trace analysis tools to effectively identify concurrent conflicting memory accesses in these systems. Finally, we design tools to help prune false positives and trigger DCbugs. We have evaluated DCatch on four representative open-source distributed cloud systems, Cassandra, Hadoop MapReduce, HBase, and ZooKeeper. By monitoring correct execution of seven workloads on these systems, DCatch reports 32 DCbugs, with 20 of them being truly harmful.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Mashtizadeh:2017:TPD, author = "Ali Jos{\'e} Mashtizadeh and Tal Garfinkel and David Terei and David Mazieres and Mendel Rosenblum", title = "Towards Practical Default-On Multi-Core Record\slash Replay", journal = j-SIGPLAN, volume = "52", number = "4", pages = "693--708", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037751", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Castor, a record/replay system for multi-core applications that provides consistently low and predictable overheads. With Castor, developers can leave record and replay on by default, making it practical to record and reproduce production bugs, or employ fault tolerance to recover from hardware failures. Castor is inspired by several observations: First, an efficient mechanism for logging non-deterministic events is critical for recording demanding workloads with low overhead. Through careful use of hardware we were able to increase log throughput by 10x or more, e.g., we could record a server handling 10x more requests per second for the same record overhead. Second, most applications can be recorded without modifying source code by using the compiler to instrument language level sources of non-determinism, in conjunction with more familiar techniques like shared library interposition. Third, while Castor cannot deterministically replay all data races, this limitation is generally unimportant in practice, contrary to what prior work has assumed. Castor currently supports applications written in C, C++, and Go on FreeBSD. 
We have evaluated Castor on parallel and server workloads, including a commercial implementation of memcached in Go, which runs Castor in production.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Huang:2017:PSA, author = "Jian Huang and Michael Allen-Bond and Xuechen Zhang", title = "{Pallas}: Semantic-Aware Checking for Finding Deep Bugs in Fast Path", journal = j-SIGPLAN, volume = "52", number = "4", pages = "709--722", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037743", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software optimization is constantly a serious concern for developing high-performance systems. To accelerate the workflow execution of a specific functionality, software developers usually define and implement a fast path to speed up the critical and commonly executed functions in the workflow. However, producing a bug-free fast path is nontrivial. Our study on the Linux kernel discloses that a committed fast path can have up to 19 follow-up patches for bug fixing, and most of them are deep semantic bugs, which are difficult to be pinpointed by existing bug-finding tools. In this paper, we present such a new category of software bugs based on our fast-path bug study across various system software including virtual memory manager, file systems, network, and device drivers. We investigate their root causes and identify five error-prone aspects in a fast path: path state, trigger condition, path output, fault handling, and assistant data structure. We find that many of the deep bugs can be prevented by applying static analysis incorporating simple semantic information. We extract a set of rules based on our findings and build a toolkit PALLAS to check fast-path bugs. The evaluation results show that PALLAS can effectively reveal fast-path bugs in a variety of systems including Linux kernel, mobile operating system, software-defined networking system, and web browser.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Kotra:2017:HSC, author = "Jagadish B. Kotra and Narges Shahidi and Zeshan A. Chishti and Mahmut T. Kandemir", title = "Hardware-Software Co-design to Mitigate {DRAM} Refresh Overheads: a Case for Refresh-Aware Process Scheduling", journal = j-SIGPLAN, volume = "52", number = "4", pages = "723--736", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037724", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "DRAM cells need periodic refresh to maintain data integrity. With high capacity DRAMs, DRAM refresh poses a significant performance bottleneck as the number of rows to be refreshed (and hence the refresh cycle time, tRFC) with each refresh command increases. Modern day DRAMs perform refresh at a rank-level, while LPDDRs used in mobile environments support refresh at a per-bank level. Rank-level refresh degrades the performance significantly since none of the banks in a rank can serve the on-demand requests. 
Per-bank refresh alleviates some of the performance bottlenecks as the other banks in a rank are available for on-demand requests. Typical DRAM retention time is in the order of several milliseconds, viz., 64msec for environments operating in temperatures below 85 deg C and 32msec for environments operating above 85 deg C. With systems moving towards increased consolidation (ex: virtualized environments), DRAM refresh becomes a significant bottleneck as it reduces the available overall DRAM bandwidth per task. In this work, we propose a hardware-software co-design to mitigate DRAM refresh overheads by exposing the hardware address mapping and DRAM refresh schedule to the Operating System. We propose a novel DRAM refresh-aware process scheduling algorithm in OS which schedules applications on cores such that none of the on-demand requests from the application are stalled by refreshes. Extensive evaluation of our proposed co-design on multi-programmed SPEC CPU2006 workloads show significant performance improvement compared to the previously proposed hardware only approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Kim:2017:KPC, author = "Jinchun Kim and Elvira Teran and Paul V. Gratz and Daniel A. Jim{\'e}nez and Seth H. Pugsley and Chris Wilkerson", title = "Kill the Program Counter: Reconstructing Program Behavior in the Processor Cache Hierarchy", journal = j-SIGPLAN, volume = "52", number = "4", pages = "737--749", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037701", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data prefetching and cache replacement algorithms have been intensively studied in the design of high performance microprocessors. Typically, the data prefetcher operates in the private caches and does not interact with the replacement policy in the shared Last-Level Cache (LLC). Similarly, most replacement policies do not consider demand and prefetch requests as different types of requests. In particular, program counter (PC)-based replacement policies cannot learn from prefetch requests since the data prefetcher does not generate a PC value. PC-based policies can also be negatively affected by compiler optimizations. In this paper, we propose a holistic cache management technique called Kill-the-PC (KPC) that overcomes the weaknesses of traditional prefetching and replacement policy algorithms. KPC cache management has three novel contributions. First, a prefetcher which approximates the future use distance of prefetch requests based on its prediction confidence. Second, a simple replacement policy provides similar or better performance than current state-of-the-art PC-based prediction using global hysteresis. Third, KPC integrates prefetching and replacement policy into a whole system which is greater than the sum of its parts. Information from the prefetcher is used to improve the performance of the replacement policy and vice-versa. Finally, KPC removes the need to propagate the PC through entire on-chip cache hierarchy while providing a holistic cache management approach with better performance than state-of-the-art PC-, and non-PC-based schemes.
Our evaluation shows that KPC provides 8\% better performance than the best combination of existing prefetcher and replacement policy for multi-core workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Gao:2017:TSE, author = "Mingyu Gao and Jing Pu and Xuan Yang and Mark Horowitz and Christos Kozyrakis", title = "{TETRIS}: Scalable and Efficient Neural Network Acceleration with {$3$D} Memory", journal = j-SIGPLAN, volume = "52", number = "4", pages = "751--764", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037702", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The high accuracy of deep neural networks (NNs) has led to the development of NN accelerators that improve performance by two orders of magnitude. However, scaling these accelerators for higher performance with increasingly larger NNs exacerbates the cost and energy overheads of their memory systems, including the on-chip SRAM buffers and the off-chip DRAM channels. This paper presents the hardware architecture and software scheduling and partitioning techniques for TETRIS, a scalable NN accelerator using 3D memory. First, we show that the high throughput and low energy characteristics of 3D memory allow us to rebalance the NN accelerator design, using more area for processing elements and less area for SRAM buffers. Second, we move portions of the NN computations close to the DRAM banks to decrease bandwidth pressure and increase performance and energy efficiency. Third, we show that despite the use of small SRAM buffers, the presence of 3D memory simplifies dataflow scheduling for NN computations. We present an analytical scheduling scheme that matches the efficiency of schedules derived through exhaustive search. Finally, we develop a hybrid partitioning scheme that parallelizes the NN computations over multiple accelerators. Overall, we show that TETRIS improves the performance by 4.1x and reduces the energy by 1.5x over NN accelerators with conventional, low-power DRAM memory systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Song:2017:HBA, author = "Wonjun Song and Gwangsun Kim and Hyungjoon Jung and Jongwook Chung and Jung Ho Ahn and Jae W. Lee and John Kim", title = "History-Based Arbitration for Fairness in Processor-Interconnect of {NUMA} Servers", journal = j-SIGPLAN, volume = "52", number = "4", pages = "765--777", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037753", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "NUMA (non-uniform memory access) servers are commonly used in high-performance computing and datacenters. Within each server, a processor-interconnect (e.g., Intel QPI, AMD HyperTransport) is used to communicate between the different sockets or nodes. In this work, we explore the impact of the processor-interconnect on overall performance --- in particular, the performance unfairness caused by processor-interconnect arbitration.
It is well known that locally-fair arbitration does not guarantee globally-fair bandwidth sharing as closer nodes receive more bandwidth in a multi-hop network. However, this work demonstrates that the opposite can occur in a commodity NUMA server where remote nodes receive higher bandwidth (and perform better). We analyze this problem and identify that this occurs because of external concentration used in router micro-architectures for processor-interconnects without globally-aware arbitration. While accessing remote memory can occur in any NUMA system, performance unfairness (or performance variation) is more critical in cloud computing and virtual machines with shared resources. We demonstrate how this unfairness creates significant performance variation when a workload is executed on the Xen virtualization platform. We then provide analysis using synthetic workloads to better understand the source of unfairness and eliminate the impact of other shared resources, including the shared last-level cache and main memory. To provide fairness, we propose a novel, history-based arbitration that tracks the history of arbitration grants made in the previous history window. A weighted arbitration is done based on the history to provide global fairness. Through simulations, we show our proposed history-based arbitration can provide global fairness and minimize the processor-interconnect performance unfairness at low cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Misra:2017:ELT, author = "Pulkit A. Misra and Jeffrey S. Chase and Johannes Gehrke and Alvin R. Lebeck", title = "Enabling Lightweight Transactions with Precision Time", journal = j-SIGPLAN, volume = "52", number = "4", pages = "779--794", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037722", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Distributed transactional storage is an important service in today's data centers. Achieving high performance without high complexity is often a challenge for these systems due to sophisticated consistency protocols and multiple layers of abstraction. In this paper we show how to combine two emerging technologies---Software-Defined Flash (SDF) and precise synchronized clocks---to improve performance and reduce complexity for transactional storage within the data center. We present a distributed transactional system (called MILANA) as a layer above a durable multi-version key-value store (called SEMEL) for read-heavy workloads within a data center. SEMEL exploits write behavior of SSDs to maintain a time-ordered sequence of versions for each key efficiently and durably. MILANA adds a variant of optimistic concurrency control above SEMEL's API to service read requests from a consistent snapshot and to enable clients to make fast local commit or abort decisions for read-only transactions. Experiments with the prototype reveal up to 43\% lower transaction abort rates using IEEE Precision Time Protocol (PTP) vs. the standard Network Time Protocol (NTP). 
Under the Retwis benchmark, client-local validation of read-only transactions yields a 35\% reduction in latency and 55\% increase in transaction throughput.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Liu:2017:ITN, author = "Ming Liu and Liang Luo and Jacob Nelson and Luis Ceze and Arvind Krishnamurthy and Kishore Atreya", title = "{IncBricks}: Toward In-Network Computation with an In-Network Cache", journal = j-SIGPLAN, volume = "52", number = "4", pages = "795--809", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037731", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The emergence of programmable network devices and the increasing data traffic of datacenters motivate the idea of in-network computation. By offloading compute operations onto intermediate networking devices (e.g., switches, network accelerators, middleboxes), one can (1) serve network requests on the fly with low latency; (2) reduce datacenter traffic and mitigate network congestion; and (3) save energy by running servers in a low-power mode. However, since (1) existing switch technology doesn't provide general computing capabilities, and (2) commodity datacenter networks are complex (e.g., hierarchical fat-tree topologies, multipath communication), enabling in-network computation inside a datacenter is challenging. In this paper, as a step towards in-network computing, we present IncBricks, an in-network caching fabric with basic computing primitives. IncBricks is a hardware-software co-designed system that supports caching in the network using a programmable network middlebox. As a key-value store accelerator, our prototype lowers request latency by over 30\% and doubles throughput for 1024 byte values in a common cluster configuration. Our results demonstrate the effectiveness of in-network computing and that efficient datacenter network request processing is possible if we carefully split the computation across the different programmable computing elements in a datacenter, including programmable switches, network accelerators, and end hosts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Akturk:2017:AAA, author = "Ismail Akturk and Ulya R. Karpuzcu", title = "{AMNESIAC}: Amnesic Automatic Computer", journal = j-SIGPLAN, volume = "52", number = "4", pages = "811--824", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037741", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Due to imbalances in technology scaling, the energy consumption of data storage and communication by far exceeds the energy consumption of actual data production, i.e., computation. As a consequence, recomputing data can become more energy efficient than storing and retrieving precomputed data. At the same time, recomputation can relax the pressure on the memory hierarchy and the communication bandwidth. 
This study hence assesses the energy efficiency prospects of trading computation for communication. We introduce an illustrative proof-of-concept design, identify practical limitations, and provide design guidelines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Bai:2017:VRE, author = "Yuxin Bai and Victor W. Lee and Engin Ipek", title = "Voltage Regulator Efficiency Aware Power Management", journal = j-SIGPLAN, volume = "52", number = "4", pages = "825--838", month = apr, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093336.3037717", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:16 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Conventional off-chip voltage regulators are typically bulky and slow, and are inefficient at exploiting system and workload variability using Dynamic Voltage and Frequency Scaling (DVFS). On-die integration of voltage regulators has the potential to increase the energy efficiency of computer systems by enabling power control at a fine granularity in both space and time. The energy conversion efficiency of on-chip regulators, however, is typically much lower than off-chip regulators, which results in significant energy losses. Fine-grained power control and high voltage regulator efficiency are difficult to achieve simultaneously, with either emerging on-chip or conventional off-chip regulators. A voltage conversion framework that relies on a hierarchy of off-chip switching regulators and on-chip linear regulators is proposed to enable fine-grained power control with a regulator efficiency greater than 90\%. A DVFS control policy that is based on a reinforcement learning (RL) approach is developed to exploit the proposed framework. Per-core RL agents learn and improve their control policies independently, while retaining the ability to coordinate their actions to accomplish system level power management objectives. When evaluated on a mix of 14 parallel and 13 multiprogrammed workloads, the proposed voltage conversion framework achieves 18\% greater energy efficiency than a conventional framework that uses on-chip switching regulators. Moreover, when the RL based DVFS control policy is used to control the proposed voltage conversion framework, the system achieves a 21\% higher energy efficiency over a baseline oracle policy with coarse-grained power control capability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '17 conference proceedings.", } @Article{Heinze:2017:TSA, author = "Thomas S. Heinze and Anders M{\o}ller and Fabio Strocco", title = "Type safety analysis for {Dart}", journal = j-SIGPLAN, volume = "52", number = "2", pages = "1--12", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989226", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Optional typing is traditionally viewed as a compromise between static and dynamic type checking, where code without type annotations is not checked until runtime. 
We demonstrate that optional type annotations in Dart programs can be integrated into a flow analysis to provide static type safety guarantees both for annotated and non-annotated parts of the code. We explore two approaches: one that uses type annotations for filtering, and one that uses them as specifications. What makes this particularly challenging for Dart is that its type system is unsound even for fully annotated code. Experimental results show that the technique is remarkably effective, even without context sensitivity: 99.3\% of all property lookup operations are reported type safe in a collection of benchmark programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Mezzetti:2017:TUP, author = "Gianluca Mezzetti and Anders M{\o}ller and Fabio Strocco", title = "Type unsoundness in practice: an empirical study of {Dart}", journal = j-SIGPLAN, volume = "52", number = "2", pages = "13--24", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989227", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The type system in the Dart programming language is deliberately designed to be unsound: for a number of reasons, it may happen that a program encounters type errors at runtime although the static type checker reports no warnings. According to the language designers, this ensures a pragmatic balance between the ability to catch bugs statically and allowing a flexible programming style without burdening the programmer with a lot of spurious type warnings. In this work, we attempt to experimentally validate these design choices. Through an empirical evaluation based on open source programs written in Dart totaling 2.4 M LOC, we explore how alternative, more sound choices affect the type warnings being produced. Our results show that some, but not all, sources of unsoundness can be justified. In particular, we find that unsoundness caused by bivariant function subtyping and method overriding does not seem to help programmers. Such information may be useful when designing future versions of the language or entirely new languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Park:2017:PSS, author = "Changhee Park and Hyeonseung Im and Sukyoung Ryu", title = "Precise and scalable static analysis of {jQuery} using a regular expression domain", journal = j-SIGPLAN, volume = "52", number = "2", pages = "25--36", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989228", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "jQuery is the most popular JavaScript library but the state-of-the-art static analyzers for JavaScript applications fail to analyze simple programs that use jQuery. In this paper, we present a novel abstract string domain whose elements are simple regular expressions that can represent prefix, infix, and postfix substrings of a string and even their sets. 
We formalize the new domain in the abstract interpretation framework with abstract models of strings and objects commonly used in the existing JavaScript analyzers. For practical use of the domain, we present polynomial-time inclusion decision rules between the regular expressions and prove that the rules exactly capture the actual inclusion relation. We have implemented the domain as an extension of the open-source JavaScript analyzer, SAFE, and we show that the extension significantly improves the scalability and precision of the baseline analyzer in analyzing programs that use jQuery.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{DeWael:2017:JTI, author = "Mattias {De Wael} and Janwillem Swalens and Wolfgang {De Meuter}", title = "Just-in-time inheritance: a dynamic and implicit multiple inheritance mechanism", journal = j-SIGPLAN, volume = "52", number = "2", pages = "37--47", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989229", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multiple inheritance is often criticised for the ambiguity that arises when multiple parents want to pass on a feature with the same name to their offspring. A survey of programming languages reveals that no programming language has an inherently implicit and dynamic approach to resolve this ambiguity. This paper identifies just-in-time inheritance as the first implicit and dynamic inheritance mechanism. The key idea of just-in-time inheritance is that one of the parents is favoured over the others, which resolves the ambiguity, and that the favoured parent can change at runtime. However, just-in-time inheritance is not the silver bullet to solve all ambiguity problems heir to multiple inheritance, because it is not applicable in all scenarios. We conclude that the applicability of just-in-time inheritance is to be found in systems where multiple inheritance is used to model an ``is-a OR is-a''-relation, rather than the more traditional ``is-a AND is-a''-relation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Meier:2017:PVM, author = "Remigius Meier and Armin Rigo and Thomas R. Gross", title = "Parallel virtual machines with {RPython}", journal = j-SIGPLAN, volume = "52", number = "2", pages = "48--59", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989233", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The RPython framework takes an interpreter for a dynamic language as its input and produces a Virtual Machine (VM) for that language. RPython is being used to develop PyPy, a high-performance Python interpreter. However, the produced VM does not support parallel execution since the framework relies on a Global Interpreter Lock (GIL): PyPy serialises the execution of multi-threaded Python programs.
We describe the rationale and design of a new parallel execution model for RPython that allows the generation of parallel virtual machines while leaving the language semantics unchanged. This model then allows different implementations of concurrency control, and we discuss an implementation based on a GIL and an implementation based on Software Transactional Memory (STM). To evaluate the benefits of either choice, we adapt PyPy to work with both implementations (GIL and STM). The evaluation shows that PyPy with STM improves the runtime of a set of multi-threaded Python programs over PyPy with a GIL by factors in the range of 1.87 $ \times $ up to 5.96 $ \times $ when executing on a processor with 8 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Chari:2017:BEH, author = "Guido Chari and Diego Garbervetsky and Stefan Marr", title = "Building efficient and highly run-time adaptable virtual machines", journal = j-SIGPLAN, volume = "52", number = "2", pages = "60--71", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989234", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming language virtual machines (VMs) realize language semantics, enforce security properties, and execute applications efficiently. Fully Reflective Execution Environments (EEs) are VMs that additionally expose their whole structure and behavior to applications. This enables developers to observe and adapt VMs at run time. However, there is a belief that reflective EEs are not viable for practical usages because such flexibility would incur a high performance overhead. To refute this belief, we built a reflective EE on top of a highly optimizing dynamic compiler. We introduced a new optimization model that, based on the conjecture that variability of low-level (EE-level) reflective behavior is low in many scenarios, mitigates the most significant sources of the performance overheads related to the reflective capabilities in the EE. Our experiments indicate that reflective EEs can reach peak performance in the order of standard VMs.
Concretely, that (a) if reflective mechanisms are not used the execution overhead is negligible compared to standard VMs, (b) VM operations can be redefined at language-level without incurring significant overheads, (c) for several software adaptation tasks, applying the reflection at the VM level is not only lightweight in terms of engineering effort, but also competitive in terms of performance in comparison to other ad-hoc solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Foley-Bourgon:2017:EIC, author = "Vincent Foley-Bourgon and Laurie Hendren", title = "Efficiently implementing the copy semantics of {MATLAB}'s arrays in {JavaScript}", journal = j-SIGPLAN, volume = "52", number = "2", pages = "72--83", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989235", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/matlab.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compiling MATLAB---a dynamic, array-based language---to JavaScript is an attractive proposal: the output code can be deployed on a platform used by billions and can leverage the countless hours that have gone into making JavaScript JIT engines fast. But before that can happen, the original MATLAB code must be properly translated, making sure to bridge the semantic gaps of the two languages. An important area where MATLAB and JavaScript differ is in their handling of arrays: for example, in MATLAB, arrays are one-indexed and writing at an index beyond the end of an array extends it; in JavaScript, typed arrays are zero-indexed and writing out of bounds is a no-op. A MATLAB-to-JavaScript compiler must address these mismatches. Another salient and pervasive difference between the two languages is the assignment of arrays to variables: in MATLAB, this operation has value semantics, while in JavaScript it has reference semantics. In this paper, we present MatJuice --- a source-to-source, ahead-of-time compiler back-end for MATLAB --- and how it deals efficiently with this last issue. We present an intra-procedural data-flow analysis to track where each array variable may point to and which variables are possibly aliased. We also present the associated copy insertion transformation that uses the points-to information to insert explicit copies when necessary.
The resulting JavaScript program respects the MATLAB value semantics and we show that it performs fewer run-time copies than some alternative approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Stadler:2017:ORL, author = "Lukas Stadler and Adam Welc and Christian Humer and Mick Jordan", title = "Optimizing {R} language execution via aggressive speculation", journal = j-SIGPLAN, volume = "52", number = "2", pages = "84--95", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989236", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The R language, from the point of view of language design and implementation, is a unique combination of various programming language concepts. It has functional characteristics like lazy evaluation of arguments, but also allows expressions to have arbitrary side effects. Many runtime data structures, for example variable scopes and functions, are accessible and can be modified while a program executes. Several different object models allow for structured programming, but the object models can interact in surprising ways with each other and with the base operations of R. R works well in practice, but it is complex, and it is a challenge for language developers trying to improve on the current state-of-the-art, which is the reference implementation --- GNU R. The goal of this work is to demonstrate that, given the right approach and the right set of tools, it is possible to create an implementation of the R language that provides significantly better performance while keeping compatibility with the original implementation. In this paper we describe novel optimizations backed up by aggressive speculation techniques and implemented within FastR, an alternative R language implementation, utilizing Truffle --- a JVM-based language development framework developed at Oracle Labs. We also provide experimental evidence demonstrating effectiveness of these optimizations in comparison with GNU R, as well as Renjin and TERR implementations of the R language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Hemann:2017:SEL, author = "Jason Hemann and Daniel P. Friedman and William E. Byrd and Matthew Might", title = "A small embedding of logic programming with a simple complete search", journal = j-SIGPLAN, volume = "52", number = "2", pages = "96--107", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989230", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a straightforward, call-by-value embedding of a small logic programming language with a simple complete search. We construct the entire language in 54 lines of Racket --- half of which implement unification. We then layer over it, in 43 lines, a reconstruction of an existing logic programming language, miniKanren, and attest to our implementation's pedagogical value. 
Evidence suggests our combination of expressiveness, concision, and elegance is compelling: since microKanren's release, it has spawned over 50 embeddings in over two dozen host languages, including Go, Haskell, Prolog and Smalltalk.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Warth:2017:MSA, author = "Alessandro Warth and Patrick Dubroy and Tony Garnock-Jones", title = "Modular semantic actions", journal = j-SIGPLAN, volume = "52", number = "2", pages = "108--119", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989231", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parser generators give programmers a convenient and declarative way to write parsers and other language-processing applications, but their mechanisms for extension and code reuse often leave something to be desired. We introduce Ohm, a parser generator in which both grammars and their interpretations can be extended in safe and modular ways. Unlike many similar tools, Ohm completely separates grammars and semantic actions, avoiding the problems that arise when these two concerns are mixed. This paper describes the particular way in which Ohm achieves this separation, and discusses the resulting benefits to modularity and extensibility.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Marr:2017:CLC, author = "Stefan Marr and Benoit Daloze and Hanspeter M{\"o}ssenb{\"o}ck", title = "Cross-language compiler benchmarking: are we fast yet?", journal = j-SIGPLAN, volume = "52", number = "2", pages = "120--131", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989232", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Comparing the performance of programming languages is difficult because they differ in many aspects including preferred programming abstractions, available frameworks, and their runtime systems. Nonetheless, the question about relative performance comes up repeatedly in the research community, industry, and wider audience of enthusiasts. This paper presents 14 benchmarks and a novel methodology to assess the compiler effectiveness across language implementations. Using a set of common language abstractions, the benchmarks are implemented in Java, JavaScript, Ruby, Crystal, Newspeak, and Smalltalk. We show that the benchmarks exhibit a wide range of characteristics using language-agnostic metrics. Using four different languages on top of the same compiler, we show that the benchmarks perform similarly and therefore allow for a comparison of compiler effectiveness across languages. 
Based on anecdotes, we argue that these benchmarks help language implementers to identify performance bugs and optimization potential by comparing to other language implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '16 conference proceedings.", } @Article{Rompf:2017:LMS, author = "Tiark Rompf", title = "Lightweight modular staging {(LMS)}: generate all the things! (keynote)", journal = j-SIGPLAN, volume = "52", number = "3", pages = "1--1", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993237", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent years have seen a surge of interest in staging and generative programming, driven by the increasing difficulty of making high-level code run fast on modern hardware. While the mechanics of program generation are relatively well understood, we have only begun to understand how to develop systems in a generative way. The Lightweight Modular Staging (LMS) platform forms the core of a research agenda to make generative programming more widely accessible, through powerful libraries and a growing selection of case studies that illuminate design patterns and crystallize best practices for high-level and effective generative programming. This talk will reflect on the foundations of LMS, on applications, achievements, challenges, as well as ongoing and future work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Biboudis:2017:RJD, author = "Aggelos Biboudis and Pablo Inostroza and Tijs van der Storm", title = "{Recaf}: {Java} dialects as libraries", journal = j-SIGPLAN, volume = "52", number = "3", pages = "2--13", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993239", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mainstream programming languages like Java have limited support for language extensibility. Without mechanisms for syntactic abstraction, new programming styles can only be embedded in the form of libraries, limiting expressiveness. In this paper, we present Recaf, a lightweight tool for creating Java dialects; effectively extending Java with new language constructs and user defined semantics. The Recaf compiler generically transforms designated method bodies to code that is parameterized by a semantic factory (Object Algebra), defined in plain Java. The implementation of such a factory defines the desired runtime semantics. We applied our design to produce several examples from a diverse set of programming styles and two case studies: we define (i) extensions for generators, asynchronous computations and asynchronous streams and (ii) a Domain-Specific Language (DSL) for Parsing Expression Grammars (PEGs), in a few lines of code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Wang:2017:CJ, author = "Yanlin Wang and Haoyuan Zhang and Bruno C. d. S. 
Oliveira and Marco Servetto", title = "Classless {Java}", journal = j-SIGPLAN, volume = "52", number = "3", pages = "14--24", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993238", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents an OO style without classes, which we call interface-based object-oriented programming (IB). IB is a natural extension of closely related ideas such as traits. Abstract state operations provide a new way to deal with state, which allows for flexibility not available in class-based languages. In IB state can be type-refined in subtypes. The combination of a purely IB style and type-refinement enables powerful idioms using multiple inheritance and state. To introduce IB to programmers we created Classless Java: an embedding of IB directly into Java. Classless Java uses annotation processing for code generation and relies on new features of Java 8 for interfaces. The code generation techniques used in Classless Java have interesting properties, including guarantees that the generated code is type-safe and good integration with IDEs. Usefulness of IB and Classless Java is shown with examples and case studies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Zacharopoulos:2017:EMM, author = "Theologos Zacharopoulos and Pablo Inostroza and Tijs van der Storm", title = "Extensible modeling with managed data in {Java}", journal = j-SIGPLAN, volume = "52", number = "3", pages = "25--35", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993240", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many model-driven development (MDD) tools employ specialized frameworks and modeling languages, and assume that the semantics of models is provided by some form of code generation. As a result, programming against models is cumbersome and does not integrate well with ordinary programming languages and IDEs. In this paper we present MD4J, a modeling approach for embedding metamodels directly in Java, using plain interfaces and annotations. The semantics is provided by data managers that create and manipulate models. This architecture enables two kinds of extensibility. First, the data managers can be changed or extended to obtain different base semantics of a model. This allows a kind of aspect-oriented programming. Second, the metamodels themselves can be extended with additional fields and methods to modularly enrich a modeling language. We illustrate our approach using the example of state machines, discuss the implementation, and evaluate it with two case-studies: the execution of UML activity diagrams and an aspect-oriented refactoring of JHotDraw.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Rosa:2017:APV, author = "Andrea Ros{\`a} and Lydia Y. 
Chen and Walter Binder", title = "Actor profiling in virtual execution environments", journal = j-SIGPLAN, volume = "52", number = "3", pages = "36--46", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993241", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nowadays, many virtual execution environments benefit from concurrency offered by the actor model. Unfortunately, while actors are used in many applications, existing profiling tools are not much effective in analyzing the performance of applications using actors. In this paper, we present a new instrumentation-based technique to profile actors in virtual execution environments. Our technique adopts platform-independent profiling metrics that minimize the perturbations induced by the instrumentation logic and allow comparing profiling results across different platforms. In particular, our technique measures the initialization cost, the amount of executed computations, and the messages sent and received by each actor. We implement our technique within a profiling tool for Akka actors on the Java platform. Evaluation results show that our profiling technique helps performance analysis of actor utilization and communication between actors in large-scale computing frameworks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Konat:2017:BDS, author = "Gabri{\"e}l Konat and Sebastian Erdweg and Eelco Visser", title = "Bootstrapping domain-specific meta-languages in language workbenches", journal = j-SIGPLAN, volume = "52", number = "3", pages = "47--58", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993242", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is common practice to bootstrap compilers of programming languages. By using the compiled language to implement the compiler, compiler developers can code in their own high-level language and gain a large-scale test case. In this paper, we investigate bootstrapping of compiler-compilers as they occur in language workbenches. Language workbenches support the development of compilers through the application of multiple collaborating domain-specific meta-languages for defining a language's syntax, analysis, code generation, and editor support. We analyze the bootstrapping problem of language workbenches in detail, propose a method for sound bootstrapping based on fixpoint compilation, and show how to conduct breaking meta-language changes in a bootstrapped language workbench. 
We have applied sound bootstrapping to the Spoofax language workbench and report on our experience.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Philips:2017:DDD, author = "Laure Philips and Joeri {De Koster} and Wolfgang {De Meuter} and Coen {De Roover}", title = "Dependence-driven delimited {CPS} transformation for {JavaScript}", journal = j-SIGPLAN, volume = "52", number = "3", pages = "59--69", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993243", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In today's web applications asynchronous requests to remote services using callbacks or futures are omnipresent. The continuation of such a non-blocking task is represented as a callback function that will later be called with the result of the request. This style of programming where the remainder of a computation is captured in a continuation function is called continuation-passing style (CPS). This style of programming can quickly lead to a phenomenon called ``callback hell'', which has a negative impact on the maintainability of applications that employ this style. Several alternatives to callbacks are therefore gaining traction within the web domain. For example, there are a number of frameworks that rely on automatically transforming sequential style code into the continuation-passing style. However, these frameworks often employ a conservative approach in which each function call is transformed into CPS. This conservative approach can sequentialise requests that could otherwise be run in parallel. So-called delimited continuations can remedy this, but require special marks that have to be manually inserted in the code for marking the beginning and end of the continuation. In this paper we propose an alternative strategy in which we apply a delimited CPS transformation that operates on a Program Dependence Graph instead to find the limits of each continuation. We implement this strategy in JavaScript and demonstrate its applicability to various web programming scenarios.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Lee:2017:SRE, author = "Mina Lee and Sunbeom So and Hakjoo Oh", title = "Synthesizing regular expressions from examples for introductory automata assignments", journal = j-SIGPLAN, volume = "52", number = "3", pages = "70--80", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993244", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a method for synthesizing regular expressions for introductory automata assignments. Given a set of positive and negative examples, the method automatically synthesizes the simplest possible regular expression that accepts all the positive examples while rejecting all the negative examples. The key novelty is the search-based synthesis algorithm that leverages ideas from over- and under-approximations to effectively prune out a large search space.
We have implemented our technique in a tool and evaluated it with non-trivial benchmark problems that students often struggle with. The results show that our system can synthesize desired regular expressions in 6.7 seconds on the average, so that it can be interactively used by students to enhance their understanding of regular expressions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Omar:2017:PSF, author = "Cyrus Omar and Jonathan Aldrich", title = "Programmable semantic fragments: the design and implementation of {\tt typy}", journal = j-SIGPLAN, volume = "52", number = "3", pages = "81--92", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993245", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces typy, a statically typed programming language embedded by reflection into Python. typy features a fragmentary semantics, i.e. it delegates semantic control over each term, drawn from Python's fixed concrete and abstract syntax, to some contextually relevant user-defined semantic fragment. The delegated fragment programmatically (1) typechecks the term (following a bidirectional protocol); and (2) assigns dynamic meaning to the term by computing a translation to Python. We argue that this design is expressive with examples of fragments that express the static and dynamic semantics of (1) functional records; (2) labeled sums (with nested pattern matching a la ML); (3) a variation on JavaScript's prototypal object system; and (4) typed foreign interfaces to Python and OpenCL. These semantic structures are, or would need to be, defined primitively in conventionally structured languages. We further argue that this design is compositionally well-behaved. It avoids the expression problem and the problems of grammar composition because the syntax is fixed. Moreover, programs are semantically stable under fragment composition (i.e. defining a new fragment will not change the meaning of existing program components.)", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Kienzle:2017:DDV, author = "J{\"o}rg Kienzle and Gunter Mussbacher and Philippe Collet and Omar Alam", title = "Delaying decisions in variable concern hierarchies", journal = j-SIGPLAN, volume = "52", number = "3", pages = "93--103", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993246", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concern-Oriented Reuse (CORE) proposes a new way of structuring model-driven software development, where models of the system are modularized by domains of abstraction within units of reuse called concerns. Within a CORE concern, models are further decomposed and modularized by features. This paper extends CORE with a technique that enables developers of high-level concerns to reuse lower-level concerns without unnecessarily committing to a specific feature selection. 
The developer can select the functionality that is minimally needed to continue development, and reexpose relevant alternative lower-level features of the reused concern in the reusing concern's interface. This effectively delays decision making about alternative functionality until the higher-level reuse context, where more detailed requirements are known and further decisions can be made. The paper describes the algorithms for composing the variation (i.e., feature and impact models), customization, and usage interfaces of a concern, as well as the concern's realization models and finally an entire concern hierarchy, as is necessary to support delayed decision making in CORE.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Adam:2017:ACG, author = "Sorin Adam and Marco Kuhrmann and Ulrik Pagh Schultz", title = "Automatic code generation in practice: experiences with embedded robot controllers", journal = j-SIGPLAN, volume = "52", number = "3", pages = "104--108", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993247", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mobile robots often use a distributed architecture in which software components are deployed to heterogeneous hardware modules. Ensuring the consistency with the designed architecture is a complex task, notably if functional safety requirements have to be fulfilled. We propose to use a domain-specific language to specify those requirements and to allow for generating a safety-enforcing layer of code, which is deployed to the robot. The paper at hand reports experiences in practically applying code generation to mobile robots. For two cases, we discuss how we addressed challenges, e.g., regarding weaving code generation into proprietary development environments and testing of manually written code. We find that a DSL based on the same conceptual model can be used across different kinds of hardware modules, but a significant adaptation effort is required in practical scenarios involving different kinds of hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Braz:2017:CCA, author = "Larissa Braz and Rohit Gheyi and Melina Mongiovi and M{\'a}rcio Ribeiro and Fl{\'a}vio Medeiros and Leopoldo Teixeira", title = "A change-centric approach to compile configurable systems with {\tt \#ifdef}s", journal = j-SIGPLAN, volume = "52", number = "3", pages = "109--119", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993250", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Configurable systems typically use \#ifdefs to denote variability. Generating and compiling all configurations may be time-consuming. An alternative consists of using variability-aware parsers, such as TypeChef. However, they may not scale. In practice, compiling the complete systems may be costly. Therefore, developers can use sampling strategies to compile only a subset of the configurations. 
We propose a change-centric approach to compile configurable systems with \#ifdefs by analyzing only configurations impacted by a code change (transformation). We implement it in a tool called CHECKCONFIGMX, which reports the new compilation errors introduced by the transformation. We perform an empirical study to evaluate 3,913 transformations applied to the 14 largest files of BusyBox, Apache HTTPD, and Expat configurable systems. CHECKCONFIGMX finds 595 compilation errors of 20 types introduced by 41 developers in 214 commits (5.46\% of the analyzed transformations). In our study, it reduces by at least 50\% (an average of 99\%) the effort of evaluating the analyzed transformations by comparing with the exhaustive approach without considering a feature model. CHECKCONFIGMX may help developers to reduce compilation effort to evaluate fine-grained transformations applied to configurable systems with \#ifdefs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Pereira:2017:FBP, author = "Juliana Alves Pereira and Pawel Matuszyk and Sebastian Krieter and Myra Spiliopoulou and Gunter Saake", title = "A feature-based personalized recommender system for product-line configuration", journal = j-SIGPLAN, volume = "52", number = "3", pages = "120--131", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993249", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today's competitive marketplace requires the industry to understand unique and particular needs of their customers. Product line practices enable companies to create individual products for every customer by providing an interdependent set of features. Users configure personalized products by consecutively selecting desired features based on their individual needs. However, as most features are interdependent, users must understand the impact of their gradual selections in order to make valid decisions. Thus, especially when dealing with large feature models, specialized assistance is needed to guide the users in configuring their product. Recently, recommender systems have proved to be an appropriate mean to assist users in finding information and making decisions. In this paper, we propose an advanced feature recommender system that provides personalized recommendations to users. In detail, we offer four main contributions: (i) We provide a recommender system that suggests relevant features to ease the decision-making process. (ii) Based on this system, we provide visual support to users that guides them through the decision-making process and allows them to focus on valid and relevant parts of the configuration space. (iii) We provide an interactive open-source configurator tool encompassing all those features. 
(iv) In order to demonstrate the performance of our approach, we compare three different recommender algorithms in two real case studies derived from business experience.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Kowal:2017:EAF, author = "Matthias Kowal and Sofia Ananieva and Thomas Th{\"u}m", title = "Explaining anomalies in feature models", journal = j-SIGPLAN, volume = "52", number = "3", pages = "132--143", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993248", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The development of variable software, in general, and feature models, in particular, is an error-prone and time-consuming task. It gets increasingly more challenging with industrial-size models containing hundreds or thousands of features and constraints. Each change may lead to anomalies in the feature model such as making some features impossible to select. While the detection of anomalies is well-researched, giving explanations is still a challenge. Explanations must be as accurate and understandable as possible to support the developer in repairing the source of an error. We propose an efficient and generic algorithm for explaining different anomalies in feature models. Additionally, we achieve a benefit for the developer by computing short explanations expressed in a user-friendly manner and by emphasizing specific parts in explanations that are more likely to be the cause of an anomaly. We provide an open-source implementation in FeatureIDE and show its scalability for industrial-size feature models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Al-Hajjaji:2017:IEP, author = "Mustafa Al-Hajjaji and Sebastian Krieter and Thomas Th{\"u}m and Malte Lochau and Gunter Saake", title = "{IncLing}: efficient product-line testing using incremental pairwise sampling", journal = j-SIGPLAN, volume = "52", number = "3", pages = "144--155", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993253", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A software product line comprises a family of software products that share a common set of features. It enables customers to compose software systems from a managed set of features. Testing every product of a product line individually is often infeasible due to the exponential number of possible products in the number of features. Several approaches have been proposed to restrict the number of products to be tested by sampling a subset of products achieving sufficient combinatorial interaction coverage. However, existing sampling algorithms do not scale well to large product lines, as they require a considerable amount of time to generate the samples. Moreover, samples are not available until a sampling algorithm completely terminates. 
As testing time is usually limited, we propose an incremental approach of product sampling for pairwise interaction testing (called IncLing), which enables developers to generate samples on demand in a step-wise manner. Furthermore, IncLing uses heuristics to efficiently achieve pairwise interaction coverage with a reasonable number of products. We evaluated IncLing by comparing it against existing sampling algorithms using feature models of different sizes. The results of our approach indicate efficiency improvements for product-line testing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Rothberg:2017:TSC, author = "Valentin Rothberg and Christian Dietrich and Andreas Ziegler and Daniel Lohmann", title = "Towards scalable configuration testing in variable software", journal = j-SIGPLAN, volume = "52", number = "3", pages = "156--167", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993252", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Testing a software product line such as Linux implies building the source with different configurations. Manual approaches to generate configurations that enable code of interest are doomed to fail due to the high amount of variation points distributed over the feature model, the build system and the source code. Research has proposed various approaches to generate covering configurations, but the algorithms show many drawbacks related to run-time, exhaustiveness and the amount of generated configurations. Hence, analyzing an entire Linux source can yield more than 30 thousand configurations and thereby exceeds the limited budget and resources for build testing. In this paper, we present an approach to fill the gap between a systematic generation of configurations and the necessity to fully build software in order to test it. By merging previously generated configurations, we reduce the number of necessary builds and enable global variability-aware testing. We reduce the problem of merging configurations to finding maximum cliques in a graph. We evaluate the approach on the Linux kernel, compare the results to common practices in industry, and show that our implementation scales even when facing graphs with millions of edges.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Steindorfer:2017:TSP, author = "Michael J. Steindorfer and Jurgen J. Vinju", title = "Towards a software product line of trie-based collections", journal = j-SIGPLAN, volume = "52", number = "3", pages = "168--172", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993251", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Collection data structures in standard libraries of programming languages are designed to excel for the average case by carefully balancing memory footprint and runtime performance. These implicit design decisions and hard-coded trade-offs do constrain users from using an optimal variant for a given problem. 
Although a wide range of specialized collections is available for the Java Virtual Machine (JVM), they introduce yet another dependency and complicate user adoption by requiring specific Application Program Interfaces (APIs) incompatible with the standard library. A product line for collection data structures would relieve library designers from optimizing for the general case. Furthermore, a product line allows evolving the potentially large code base of a collection family efficiently. The challenge is to find a small core framework for collection data structures which covers all variations without exhaustively listing them, while supporting good performance at the same time. We claim that the concept of Array Mapped Tries (AMTs) embodies a high degree of commonality in the sub-domain of immutable collection data structures. AMTs are flexible enough to cover most of the variability, while minimizing code bloat in the generator and the generated code. We implemented a Data Structure Code Generator (DSCG) that emits immutable collections based on an AMT skeleton foundation. The generated data structures outperform competitive hand-optimized implementations, and the generator still allows for customization towards specific workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Al-Hajjaji:2017:TDT, author = "Mustafa Al-Hajjaji and Jens Meinicke and Sebastian Krieter and Reimar Schr{\"o}ter and Thomas Th{\"u}m and Thomas Leich and Gunter Saake", title = "Tool demo: testing configurable systems with {FeatureIDE}", journal = j-SIGPLAN, volume = "52", number = "3", pages = "173--177", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993254", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most software systems are designed to provide custom functionality using configuration options. Testing such systems is challenging as running tests of a single configuration is often not sufficient, because defects may appear in other configurations. Ideally, all configurations of a software system should be tested, which is usually not applicable in practice due to the combinatorial explosion with respect to the configuration options. Multiple sampling strategies aim to reduce the set of tested configurations to a feasible amount, such as T-wise sampling, random configurations, and user-defined configurations. However, these strategies are often not applied in practice as they require manual effort or a specialized testing framework. Within our tool FeatureIDE, we integrate all aforementioned strategies and reduce the manual effort by automating the process of generating and testing configurations. Furthermore, we provide support for unit testing to avoid redundant test executions and for variability-aware testing. 
With this extension of FeatureIDE, we aim to make recent testing techniques for configurable systems applicable in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Makki:2017:ART, author = "Majid Makki and Dimitri {Van Landuyt} and Wouter Joosen", title = "Automated regression testing of {BPMN 2.0} processes: a capture and replay framework for continuous delivery", journal = j-SIGPLAN, volume = "52", number = "3", pages = "178--189", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993257", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Regression testing is a form of software quality assurance (QA) that involves comparing the behavior of a newer version of a software artifact to its earlier correct behavior, and signaling the QA engineer when deviations are detected. Given the large potential in automated generation and execution of regression test cases for business process models in the context of running systems, powerful tools are required to make this practically feasible, more specifically to limit the potential impact on production systems, and to reduce the manual effort required from QA engineers. In this paper, we present a regression testing automation framework that implements the capture {\&} replay paradigm in the context of BPMN 2.0, a domain-specific language for modeling and executing business processes. The framework employs parallelization techniques and efficient communication patterns to reduce the performance overhead of capturing. Based on inputs from the QA engineer, it manipulates the BPMN2 model before executing tests for isolating the latter from external dependencies (e.g. human actors or expensive web services) and for avoiding undesired side-effects. Finally, it performs a regression detection algorithm and reports the results to the QA engineer. We have implemented our framework on top of a BPMN2-compliant execution engine, namely jBPM, and performed functional validations and evaluations of its performance and fault-tolerance. The results, indicating 3.9\% average capturing performance overhead, demonstrate that the implemented framework can be the foundation of a practical regression testing tool for BPMN 2.0, and a key enabler for continuous delivery of business process-driven applications and services.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Hammer:2017:VOV, author = "Matthew A. Hammer and Bor-Yuh Evan Chang and David {Van Horn}", title = "A vision for online verification-validation", journal = j-SIGPLAN, volume = "52", number = "3", pages = "190--201", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993255", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today's programmers face a false choice between creating software that is extensible and software that is correct. 
Specifically, dynamic languages permit software that is richly extensible (via dynamic code loading, dynamic object extension, and various forms of reflection), and today's programmers exploit this flexibility to ``bring their own language features'' to enrich extensible languages (e.g., by using common JavaScript libraries). Meanwhile, such library-based language extensions generally lack enforcement of their abstractions, leading to programming errors that are complex to avoid and predict. To offer verification for this extensible world, we propose online verification-validation (OVV), which consists of language and VM design that enables a ``phaseless'' approach to program analysis, in contrast to the standard static-dynamic phase distinction. Phaseless analysis freely interposes abstract interpretation with concrete execution, allowing analyses to use dynamic (concrete) information to prove universal (abstract) properties about future execution. In this paper, we present a conceptual overview of OVV through a motivating example program that uses a hypothetical database library. We present a generic semantics for OVV, and an extension to this semantics that offers a simple gradual type system for the database library primitives. The result of instantiating this gradual type system in an OVV setting is a checker that can progressively type successive continuations of the program until a continuation is fully verified. To evaluate the proposed vision of OVV for this example, we implement the VM semantics (in Rust), and show that this design permits progressive typing in this manner.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Boussaa:2017:ANF, author = "Mohamed Boussaa and Olivier Barais and Benoit Baudry and Gerson Suny{\'e}", title = "Automatic non-functional testing of code generators families", journal = j-SIGPLAN, volume = "52", number = "3", pages = "202--212", month = mar, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093335.2993256", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The intensive use of generative programming techniques provides an elegant engineering solution to deal with the heterogeneity of platforms and technological stacks. The use of domain-specific languages, for example, leads to the creation of numerous code generators that automatically translate high-level system specifications into multi-target executable code. Producing correct and efficient code generators is complex and error-prone. Although software designers generally provide high-level test suites to verify the functional outcome of generated code, it remains challenging and tedious to verify the behavior of produced code in terms of non-functional properties. This paper describes a practical approach based on a runtime monitoring infrastructure to automatically check for potentially inefficient code generators. This infrastructure, based on system containers as execution platforms, allows code-generator developers to evaluate the generated code performance. We evaluate our approach by analyzing the performance of Haxe, a popular high-level programming language that involves a set of cross-platform code generators. 
Experimental results show that our approach is able to detect some performance inconsistencies that reveal real issues in Haxe code generators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '16 conference proceedings.", } @Article{Abadi:2016:TLF, author = "Mart{\'\i}n Abadi", title = "{TensorFlow}: learning functions at scale", journal = j-SIGPLAN, volume = "51", number = "9", pages = "1--1", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2976746", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "TensorFlow is a machine learning system that operates at large scale and in heterogeneous environments. Its computational model is based on dataflow graphs with mutable state. Graph nodes may be mapped to different machines in a cluster, and within each machine to CPUs, GPUs, and other devices. TensorFlow supports a variety of applications, but it particularly targets training and inference with deep neural networks. It serves as a platform for research and for deploying machine learning systems across many areas, such as speech recognition, computer vision, robotics, information retrieval, and natural language processing. In this talk, we describe TensorFlow and outline some of its applications. We also discuss the question of what TensorFlow and deep learning may have to do with functional programming. Although TensorFlow is not purely functional, many of its uses are concerned with optimizing functions (during training), then with applying those functions (during inference). These functions are defined as compositions of simple primitives (as is common in functional programming), with internal data representations that are learned rather than manually designed. TensorFlow is joint work with many other people in the Google Brain team and elsewhere. More information is available at tensorflow.org.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Ryu:2016:JFB, author = "Sukyoung Ryu", title = "Journey to find bugs in {JavaScript} web applications in the wild", journal = j-SIGPLAN, volume = "51", number = "9", pages = "2--2", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2976747", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Analyzing real-world JavaScript web applications is a challenging task. On top of understanding the semantics of JavaScript, it requires modeling of web documents, platform objects, and interactions between them. Not only the JavaScript language itself but also its usage patterns are extremely dynamic. JavaScript can generate code and run it during evaluation, and most web applications load JavaScript code dynamically. Such dynamic characteristics of JavaScript web applications make pure static analysis approaches inapplicable. In this talk, we present our attempts to analyze JavaScript web applications in the wild mostly statically using various approaches. 
From pure JavaScript programs to JavaScript web applications using platform-specific libraries and dynamic code loading, we explain technical challenges in analyzing each of them and how we built an open-source analysis framework for JavaScript, SAFE, that addresses the challenges incrementally. In spite of active research accomplishments in analysis of JavaScript web applications, many issues still remain to be resolved such as events, callback functions, and hybrid web applications. We discuss possible future research directions and open challenges.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Licata:2016:FPG, author = "Dan Licata", title = "A functional programmer's guide to homotopy type theory", journal = j-SIGPLAN, volume = "51", number = "9", pages = "3--3", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2976748", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dependent type theories are functional programming languages with types rich enough to do computer-checked mathematics and software verification. Homotopy type theory is a recent area of work that connects dependent type theory to the mathematical disciplines of homotopy theory and higher-dimensional category theory. From a programming point of view, these connections have revealed that all types in dependent type theory support a certain generic program that had not previously been exploited. Specifically, each type can be equipped with computationally relevant witnesses of equality of elements of that type, and all types support a generic program that transports elements along these equalities. One mechanism for equipping types with non-trivial witnesses of equality is Voevodsky's univalence axiom, which implies that equality of types themselves is witnessed by type isomorphism. Another is higher inductive types, an extended datatype schema that allows identifications between different datatype constructors. While these new mechanisms were originally formulated as axiomatic extensions of type theory, recent work has investigated their computational meaning, leading to the development of new programming languages that better support them. In this talk, I will illustrate what univalence and higher inductive types mean in programming terms. I will also discuss how studying some related semantic settings can reveal additional structure on types; for example, moving from groupoids (categories where all maps are invertible) to general categories yields an account of coercions instead of equalities. 
Overall, I hope to convey some of the beauty and richness of these connections between disciplines, which we are just beginning to understand.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Castro:2016:FPS, author = "David Castro and Kevin Hammond and Susmit Sarkar", title = "Farms, pipes, streams and reforestation: reasoning about structured parallel processes using types and hylomorphisms", journal = j-SIGPLAN, volume = "51", number = "9", pages = "4--17", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951920", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The increasing importance of parallelism has motivated the creation of better abstractions for writing parallel software, including structured parallelism using nested algorithmic skeletons. Such approaches provide high-level abstractions that avoid common problems, such as race conditions, and often allow strong cost models to be defined. However, choosing a combination of algorithmic skeletons that yields good parallel speedups for a program on some specific parallel architecture remains a difficult task. In order to achieve this, it is necessary to simultaneously reason both about the costs of different parallel structures and about the semantic equivalences between them. This paper presents a new type-based mechanism that enables strong static reasoning about these properties. We exploit well-known properties of a very general recursion pattern, hylomorphisms, and give a denotational semantics for structured parallel processes in terms of these hylomorphisms. Using our approach, it is possible to determine formally whether it is possible to introduce a desired parallel structure into a program without altering its functional behaviour, and also to choose a version of that parallel structure that minimises some given cost model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Acar:2016:DCC, author = "Umut A. Acar and Arthur Chargu{\'e}raud and Mike Rainey and Filip Sieczkowski", title = "Dag-calculus: a calculus for parallel computation", journal = j-SIGPLAN, volume = "51", number = "9", pages = "18--32", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951946", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Increasing availability of multicore systems has led to greater focus on the design and implementation of languages for writing parallel programs. Such languages support various abstractions for parallelism, such as fork-join, async-finish, futures. While they may seem similar, these abstractions lead to different semantics, language design and implementation decisions, and can significantly impact the performance of end-user applications. In this paper, we consider the question of whether it would be possible to unify various paradigms of parallel computing. 
To this end, we propose a calculus, called dag calculus, that can encode fork-join, async-finish, and futures, and possibly others. We describe dag calculus and its semantics, and establish translations from the aforementioned paradigms into dag calculus. These translations establish that dag calculus is sufficiently powerful for encoding programs written in prevailing paradigms of parallelism. We present concurrent algorithms and data structures for realizing dag calculus on multicore hardware and prove that the proposed techniques are consistent with the semantics. Finally, we present an implementation of the calculus and evaluate it empirically by comparing its performance to highly optimized code from prior work. The results show that the calculus is expressive and that it competes well with, and sometimes outperforms, the state of the art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Borgstrom:2016:LCF, author = "Johannes Borgstr{\"o}m and Ugo {Dal Lago} and Andrew D. Gordon and Marcin Szymczak", title = "A lambda-calculus foundation for universal probabilistic programming", journal = j-SIGPLAN, volume = "51", number = "9", pages = "33--46", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951942", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We develop the operational semantics of an untyped probabilistic \lambda -calculus with continuous distributions, and both hard and soft constraints, as a foundation for universal probabilistic programming languages such as Church, Anglican, and Venture. Our first contribution is to adapt the classic operational semantics of \lambda -calculus to a continuous setting via creating a measure space on terms and defining step-indexed approximations. We prove equivalence of big-step and small-step formulations of this distribution-based semantics. To move closer to inference techniques, we also define the sampling-based semantics of a term as a function from a trace of random samples to a value. We show that the distribution induced by integration over the space of traces equals the distribution-based semantics. Our second contribution is to formalize the implementation technique of trace Markov chain Monte Carlo (MCMC) for our calculus and to show its correctness. A key step is defining sufficient conditions for the distribution induced by trace MCMC to converge to the distribution-based semantics. 
To the best of our knowledge, this is the first rigorous correctness proof for trace MCMC for a higher-order functional language, or for a language with soft constraints.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Ismail:2016:DPD, author = "Wazim Mohammed Ismail and Chung-chieh Shan", title = "Deriving a probability density calculator (functional pearl)", journal = j-SIGPLAN, volume = "51", number = "9", pages = "47--59", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951922", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Given an expression that denotes a probability distribution, often we want a corresponding density function, to use in probabilistic inference. Fortunately, the task of finding a density has been automated. It turns out that we can derive a compositional procedure for finding a density, by equational reasoning about integrals, starting with the mathematical specification of what a density is. Moreover, the density found can be run as an estimation algorithm, as well as simplified as an exact formula to improve the estimate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Tan:2016:NVC, author = "Yong Kiam Tan and Magnus O. Myreen and Ramana Kumar and Anthony Fox and Scott Owens and Michael Norrish", title = "A new verified compiler backend for {CakeML}", journal = j-SIGPLAN, volume = "51", number = "9", pages = "60--73", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951924", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We have developed and mechanically verified a new compiler backend for CakeML. Our new compiler features a sequence of intermediate languages that allows it to incrementally compile away high-level features and enables verification at the right levels of semantic detail. In this way, it resembles mainstream (unverified) compilers for strict functional languages. The compiler supports efficient curried multi-argument functions, configurable data representations, exceptions that unwind the call stack, register allocation, and more. The compiler targets several architectures: x86-64, ARMv6, ARMv8, MIPS-64, and RISC-V. In this paper, we present the overall structure of the compiler, including its 12 intermediate languages, and explain how everything fits together. We focus particularly on the interaction between the verification of the register allocator and the garbage collector, and memory representations. The entire development has been carried out within the HOL4 theorem prover.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Downen:2016:SCC, author = "Paul Downen and Luke Maurer and Zena M. 
Ariola and Simon Peyton Jones", title = "Sequent calculus as a compiler intermediate language", journal = j-SIGPLAN, volume = "51", number = "9", pages = "74--88", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951931", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The $ \lambda $ -calculus is popular as an intermediate language for practical compilers. But in the world of logic it has a lesser-known twin, born at the same time, called the sequent calculus. Perhaps that would make for a good intermediate language, too? To explore this question we designed Sequent Core, a practically-oriented core calculus based on the sequent calculus, and used it to re-implement a substantial chunk of the Glasgow Haskell Compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{OConnor:2016:RTR, author = "Liam O'Connor and Zilin Chen and Christine Rizkallah and Sidney Amani and Japheth Lim and Toby Murray and Yutaka Nagashima and Thomas Sewell and Gerwin Klein", title = "Refinement through restraint: bringing down the cost of verification", journal = j-SIGPLAN, volume = "51", number = "9", pages = "89--102", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951940", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a framework aimed at significantly reducing the cost of verifying certain classes of systems software, such as file systems. Our framework allows for equational reasoning about systems code written in our new language, Cogent. Cogent is a restricted, polymorphic, higher-order, and purely functional language with linear types and without the need for a trusted runtime or garbage collector. Linear types allow us to assign two semantics to the language: one imperative, suitable for efficient C code generation; and one functional, suitable for equational reasoning and verification. As Cogent is a restricted language, it is designed to easily interoperate with existing C functions and to connect to existing C verification frameworks. Our framework is based on certifying compilation: For a well-typed Cogent program, our compiler produces C code, a high-level shallow embedding of its semantics in Isabelle/HOL, and a proof that the C code correctly refines this embedding. Thus one can reason about the full semantics of real-world systems code productively and equationally, while retaining the interoperability and leanness of C. The compiler certificate is a series of language-level proofs and per-program translation validation phases, combined into one coherent top-level theorem in Isabelle/HOL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{New:2016:FAC, author = "Max S. New and William J. 
Bowman and Amal Ahmed", title = "Fully abstract compilation via universal embedding", journal = j-SIGPLAN, volume = "51", number = "9", pages = "103--116", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951941", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A fully abstract compiler guarantees that two source components are observationally equivalent in the source language if and only if their translations are observationally equivalent in the target. Full abstraction implies the translation is secure: target-language attackers can make no more observations of a compiled component than a source-language attacker interacting with the original source component. Proving full abstraction for realistic compilers is challenging because realistic target languages contain features (such as control effects) unavailable in the source, while proofs of full abstraction require showing that every target context to which a compiled component may be linked can be back-translated to a behaviorally equivalent source context. We prove the first full abstraction result for a translation whose target language contains exceptions, but the source does not. Our translation---specifically, closure conversion of simply typed $ \lambda $-calculus with recursive types---uses types at the target level to ensure that a compiled component is never linked with attackers that have more distinguishing power than source-level attackers. We present a new back-translation technique based on a shallow embedding of the target language into the source language at a dynamic type. Then boundaries are inserted that mediate terms between the untyped embedding and the strongly-typed source. This technique allows back-translating non-terminating programs, target features that are untypeable in the source, and well-bracketed effects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Dimoulas:2016:OLP, author = "Christos Dimoulas and Max S. New and Robert Bruce Findler and Matthias Felleisen", title = "{Oh Lord}, please don't let contracts be misunderstood (functional pearl)", journal = j-SIGPLAN, volume = "51", number = "9", pages = "117--131", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951930", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Contracts feel misunderstood, especially those with a higher-order soul. While software engineers appreciate contracts as tools for articulating the interface between components, functional programmers desperately search for their types and meaning, completely forgetting about their pragmatics. This gem presents a novel analysis of contract systems. Applied to the higher-order kind, this analysis reveals their large and clearly unappreciated software engineering potential. 
Three sample applications illustrate where this kind of exploration may lead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Cicek:2016:TTI, author = "Ezgi {\c{C}}i{\c{c}}ek and Zoe Paraskevopoulou and Deepak Garg", title = "A type theory for incremental computational complexity with control flow changes", journal = j-SIGPLAN, volume = "51", number = "9", pages = "132--145", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951950", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Incremental computation aims to speed up re-runs of a program after its inputs have been modified slightly. It works by recording a trace of the program's first run and propagating changes through the trace in incremental runs, trying to re-use as much of the original trace as possible. The recent work CostIt is a type and effect system to establish the time complexity of incremental runs of a program, as a function of input changes. However, CostIt is limited in two ways. First, it prohibits input changes that influence control flow. This makes it impossible to type programs that, for instance, branch on inputs that may change. Second, the soundness of CostIt is proved relative to an abstract cost semantics, but it is unclear how the semantics can be realized. In this paper, we address both these limitations. We present DuCostIt, a re-design of CostIt, that combines reasoning about costs of change propagation and costs of from-scratch evaluation. The latter lifts the restriction on control flow changes. To obtain the type system, we refine Flow Caml, a type system for information flow analysis, with cost effects. Additionally, we inherit from CostIt index refinements to track data structure sizes and a co-monadic type. Using a combination of binary and unary step-indexed logical relations, we prove DuCostIt's cost analysis sound relative to not only an abstract cost semantics, but also a concrete semantics, which is obtained by translation to an ML-like language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Takeda:2016:CBE, author = "Kotaro Takeda and Naoki Kobayashi and Kazuya Yaguchi and Ayumi Shinohara", title = "Compact bit encoding schemes for simply-typed lambda-terms", journal = j-SIGPLAN, volume = "51", number = "9", pages = "146--157", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951918", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We consider the problem of how to compactly encode simply-typed \lambda -terms into bit strings. The work has been motivated by Kobayashi et al.'s recent work on higher-order data compression, where data are encoded as functional programs (or, \lambda -terms) that generate them. To exploit its good compression power, the compression scheme has to come with a method for compactly encoding the \lambda -terms into bit strings. 
To this end, we propose two type-based bit-encoding schemes; the first one encodes a \lambda -term into a sequence of symbols by using type information, and then applies arithmetic coding to convert the sequence to a bit string. The second one is more sophisticated; we prepare a context-free grammar (CFG) that describes only well-typed terms, and then use a variation of arithmetic coding specialized for the CFG. We have implemented both schemes and confirmed that they often output more compact codes than previous bit encoding schemes for \lambda -terms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Mu:2016:QGO, author = "Shin-Cheng Mu and Yu-Hsi Chiang and Yu-Han Lyu", title = "Queueing and glueing for optimal partitioning (functional pearl)", journal = j-SIGPLAN, volume = "51", number = "9", pages = "158--167", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951923", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The queueing-glueing algorithm is the nickname we give to an algorithmic pattern that provides amortised linear time solutions to a number of optimal list partition problems that have a peculiar property: at various moments we know that two of three candidate solutions could be optimal. The algorithm works by keeping a queue of lists, glueing them from one end, while chopping from the other end, hence the name. We give a formal derivation of the algorithm, and demonstrate it with several non-trivial examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Christiansen:2016:ASP, author = "Jan Christiansen and Nikita Danilenko and Sandra Dylus", title = "All sorts of permutations (functional pearl)", journal = j-SIGPLAN, volume = "51", number = "9", pages = "168--179", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951949", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The combination of non-determinism and sorting is mostly associated with permutation sort, a sorting algorithm that is not very useful for sorting and has an awful running time. In this paper we look at the combination of non-determinism and sorting in a different light: given a sorting function, we apply it to a non-deterministic predicate to gain a function that enumerates permutations of the input list. We get to the bottom of necessary properties of the sorting algorithms and predicates in play as well as discuss variations of the modelled non-determinism. On top of that, we formulate and prove a theorem stating that no matter which sorting function we use, the corresponding permutation function enumerates all permutations of the input list. 
We use free theorems, which are derived from the type of a function alone, to prove the statement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Serrano:2016:GH, author = "Manuel Serrano and Vincent Prunet", title = "A glimpse of {Hopjs}", journal = j-SIGPLAN, volume = "51", number = "9", pages = "180--192", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951916", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hop.js is a multitier programming environment for JavaScript. It allows a single JavaScript program to describe the client-side and the server-side components of a web application. Its runtime environment ensures consistent executions of the application on the server and on the client. This paper overviews the Hop.js design. It shows the JavaScript extensions that makes it possible to conceive web applications globally. It presents how Hop.js interacts with the outside world. It also briefly presents the Hop.js implementation. It presents the Hop.js web server implementation, the handling of server-side parallelism, and the JavaScript and HTML compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Sergey:2016:ERG, author = "Ilya Sergey", title = "Experience report: growing and shrinking polygons for random testing of computational geometry algorithms", journal = j-SIGPLAN, volume = "51", number = "9", pages = "193--199", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951927", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper documents our experience of adapting and using the QuickCheck-style approach for extensive randomised property-based testing of computational geometry algorithms. The need in rigorous evaluation of computational geometry procedures has naturally arisen in our quest of organising a medium-size programming contest for second year university students-an experiment we conducted as an attempt to introduce them to computational geometry. The main effort in organising the event was implementation of a solid infrastructure for testing and ranking solutions. For this, we employed functional programming techniques. The choice of the language and the paradigm made it possible for us to engineer, from scratch and in a very short period of time, a series of robust geometric primitives and algorithms, as well as implement a scalable framework for their randomised testing. 
We describe the main insights, enabling efficient random testing of geometric procedures, and report on our experience of using the testing framework, which helped us to detect and fix a number of issues not just in our programming artefacts, but also in the published algorithms we had implemented.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Emoto:2016:TLV, author = "Kento Emoto and Kiminori Matsuzaki and Zhenjiang Hu and Akimasa Morihata and Hideya Iwasaki", title = "Think like a vertex, behave like a function! {A} functional {DSL} for vertex-centric big graph processing", journal = j-SIGPLAN, volume = "51", number = "9", pages = "200--213", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951938", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The vertex-centric programming model, known as ``think like a vertex'', is being used more and more to support various big graph processing methods through iterative supersteps that execute in parallel a user-defined vertex program over each vertex of a graph. However, the imperative and message-passing style of existing systems makes defining a vertex program unintuitive. In this paper, we show that one can benefit more from ``Thinking like a vertex'' by ``Behaving like a function'' rather than ``Acting like a procedure'' with full use of side effects and explicit control of message passing, state, and termination. We propose a functional approach to vertex-centric graph processing in which the computation at every vertex is abstracted as a higher-order function and present Fregel, a new domain-specific language. Fregel has clear functional semantics, supports declarative description of vertex computation, and can be automatically translated into Pregel, an emerging imperative-style distributed graph processing framework, and thereby achieve promising performance. Experimental results for several typical examples show the promise of this functional approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Arntzenius:2016:DFD, author = "Michael Arntzenius and Neelakantan R. Krishnaswami", title = "{Datafun}: a functional {Datalog}", journal = j-SIGPLAN, volume = "51", number = "9", pages = "214--227", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951948", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Datalog may be considered either an unusually powerful query language or a carefully limited logic programming language. Datalog is declarative, expressive, and optimizable, and has been applied successfully in a wide variety of problem domains. However, most use-cases require extending Datalog in an application-specific manner. In this paper we define Datafun, an analogue of Datalog supporting higher-order functional programming. 
The key idea is to track monotonicity with types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Seidel:2016:DWS, author = "Eric L. Seidel and Ranjit Jhala and Westley Weimer", title = "Dynamic witnesses for static type errors (or, ill-typed programs usually go wrong)", journal = j-SIGPLAN, volume = "51", number = "9", pages = "228--242", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951915", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Static type errors are a common stumbling block for newcomers to typed functional languages. We present a dynamic approach to explaining type errors by generating counterexample witness inputs that illustrate how an ill-typed program goes wrong. First, given an ill-typed function, we symbolically execute the body to synthesize witness values that make the program go wrong. We prove that our procedure synthesizes general witnesses in that if a witness is found, then for all inhabited input types, there exist values that can make the function go wrong. Second, we show how to extend the above procedure to produce a reduction graph that can be used to interactively visualize and debug witness executions. Third, we evaluate the coverage of our approach on two data sets comprising over 4,500 ill-typed student programs. Our technique is able to generate witnesses for 88\% of the programs, and our reduction graph yields small counterexamples for 81\% of the witnesses. Finally, we evaluate whether our witnesses help students understand and fix type errors, and find that students presented with our witnesses show a greater understanding of type errors than those presented with a standard error message.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Watanabe:2016:ADF, author = "Keiichi Watanabe and Ryosuke Sato and Takeshi Tsukada and Naoki Kobayashi", title = "Automatically disproving fair termination of higher-order functional programs", journal = j-SIGPLAN, volume = "51", number = "9", pages = "243--255", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951919", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose an automated method for disproving fair termination of higher-order functional programs, which is complementary to Murase et al.'s recent method for proving fair termination. A program is said to be fair terminating if it has no infinite execution trace that satisfies a given fairness constraint. Fair termination is an important property because program verification problems for arbitrary \omega -regular temporal properties can be transformed to those of fair termination. Our method reduces the problem of disproving fair termination to higher-order model checking by using predicate abstraction and CEGAR. 
Given a program, we convert it to an abstract program that generates an approximation of the (possibly infinite) execution traces of the original program, so that the original program has a fair infinite execution trace if the tree generated by the abstract program satisfies a certain property. The method is a non-trivial extension of Kuwahara et al.'s method for disproving plain termination.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Jung:2016:HOG, author = "Ralf Jung and Robbert Krebbers and Lars Birkedal and Derek Dreyer", title = "Higher-order ghost state", journal = j-SIGPLAN, volume = "51", number = "9", pages = "256--269", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951943", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The development of concurrent separation logic (CSL) has sparked a long line of work on modular verification of sophisticated concurrent programs. Two of the most important features supported by several existing extensions to CSL are higher-order quantification and custom ghost state. However, none of the logics that support both of these features reap the full potential of their combination. In particular, none of them provide general support for a feature we dub ``higher-order ghost state'': the ability to store arbitrary higher-order separation-logic predicates in ghost variables. In this paper, we propose higher-order ghost state as an interesting and useful extension to CSL, which we formalize in the framework of Jung et al.'s recently developed Iris logic. To justify its soundness, we develop a novel algebraic structure called CMRAs (``cameras''), which can be thought of as ``step-indexed partial commutative monoids''. Finally, we show that Iris proofs utilizing higher-order ghost state can be effectively formalized in Coq, and discuss the challenges we faced in formalizing them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Cockx:2016:UEP, author = "Jesper Cockx and Dominique Devriese and Frank Piessens", title = "Unifiers as equivalences: proof-relevant unification of dependently typed data", journal = j-SIGPLAN, volume = "51", number = "9", pages = "270--283", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951917", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dependently typed languages such as Agda, Coq and Idris use a syntactic first-order unification algorithm to check definitions by dependent pattern matching. However, these algorithms don't adequately consider the types of the terms being unified, leading to various unintended results. As a consequence, they require ad hoc restrictions to preserve soundness, but this makes them very hard to prove correct, modify, or extend. This paper proposes a framework for reasoning formally about unification in a dependently typed setting. 
In this framework, unification rules compute not just a unifier but also a corresponding correctness proof in the form of an equivalence between two sets of equations. By rephrasing the standard unification rules in a proof-relevant manner, they are guaranteed to preserve soundness of the theory. In addition, it enables us to safely add new rules that can exploit the dependencies between the types of equations. Using our framework, we reimplemented the unification algorithm used by Agda. As a result, we were able to replace previous ad hoc restrictions with formally verified unification rules, fixing a number of bugs in the process. We are convinced this will also enable the addition of new and interesting unification rules in the future, without compromising soundness along the way.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Christiansen:2016:ERE, author = "David Christiansen and Edwin Brady", title = "Elaborator reflection: extending {Idris} in {Idris}", journal = j-SIGPLAN, volume = "51", number = "9", pages = "284--297", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951932", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many programming languages and proof assistants are defined by elaboration from a high-level language with a great deal of implicit information to a highly explicit core language. In many advanced languages, these elaboration facilities contain powerful tools for program construction, but these tools are rarely designed to be repurposed by users. We describe elaborator reflection, a paradigm for metaprogramming in which the elaboration machinery is made directly available to metaprograms, as well as a concrete realization of elaborator reflection in Idris, a functional language with full dependent types. We demonstrate the applicability of Idris's reflected elaboration framework to a number of realistic problems, we discuss the motivation for the specific features of its design, and we explore the broader meaning of elaborator reflection as it can relate to other languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Dagand:2016:PTE, author = "Pierre-Evariste Dagand and Nicolas Tabareau and {\'E}ric Tanter", title = "Partial type equivalences for verified dependent interoperability", journal = j-SIGPLAN, volume = "51", number = "9", pages = "298--310", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951933", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Full-spectrum dependent types promise to enable the development of correct-by-construction software. However, even certified software needs to interact with simply-typed or untyped programs, be it to perform system calls, or to use legacy libraries. 
Trading static guarantees for runtime checks, the dependent interoperability framework provides a mechanism by which simply-typed values can safely be coerced to dependent types and, conversely, dependently-typed programs can defensively be exported to a simply-typed application. In this paper, we give a semantic account of dependent interoperability. Our presentation relies on and is guided by a pervading notion of type equivalence, whose importance has been emphasized in recent work on homotopy type theory. Specifically, we develop the notion of partial type equivalences as a key foundation for dependent interoperability. Our framework is developed in Coq; it is thus constructive and verified in the strictest sense of the terms. Using our library, users can specify domain-specific partial equivalences between data structures. Our library then takes care of the (sometimes, heavy) lifting that leads to interoperable programs. It thus becomes possible, as we shall illustrate, to internalize and hand-tune the extraction of dependently-typed programs to interoperable OCaml programs within Coq itself.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Darais:2016:CGC, author = "David Darais and David {Van Horn}", title = "Constructive {Galois} connections: taming the {Galois} connection framework for mechanized metatheory", journal = j-SIGPLAN, volume = "51", number = "9", pages = "311--324", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951934", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Galois connections are a foundational tool for structuring abstraction in semantics and their use lies at the heart of the theory of abstract interpretation. Yet, mechanization of Galois connections remains limited to restricted modes of use, preventing their general application in mechanized metatheory and certified programming. This paper presents constructive Galois connections, a variant of Galois connections that is effective both on paper and in proof assistants; is complete with respect to a large subset of classical Galois connections; and enables more general reasoning principles, including the ``calculational'' style advocated by Cousot. To design constructive Galois connection we identify a restricted mode of use of classical ones which is both general and amenable to mechanization in dependently-typed functional programming languages. Crucial to our metatheory is the addition of monadic structure to Galois connections to control a ``specification effect''. Effectful calculations may reason classically, while pure calculations have extractable computational content. Explicitly moving between the worlds of specification and implementation is enabled by our metatheory. To validate our approach, we provide two case studies in mechanizing existing proofs from the literature: one uses calculational abstract interpretation to design a static analyzer, the other forms a semantic basis for gradual typing. 
Both mechanized proofs closely follow their original paper-and-pencil counterparts, employ reasoning principles not captured by previous mechanization approaches, support the extraction of verified algorithms, and are novel.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Blazy:2016:AMF, author = "Sandrine Blazy and Vincent Laporte and David Pichardie", title = "An abstract memory functor for verified {C} static analyzers", journal = j-SIGPLAN, volume = "51", number = "9", pages = "325--337", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951937", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Abstract interpretation provides advanced techniques to infer numerical invariants on programs. There is an abundant literature about numerical abstract domains that operate on scalar variables. This work deals with lifting these techniques to a realistic C memory model. We present an abstract memory functor that takes as argument any standard numerical abstract domain, and builds a memory abstract domain that finely tracks properties about memory contents, taking into account union types, pointer arithmetic and type casts. This functor is implemented and verified inside the Coq proof assistant with respect to the CompCert compiler memory model. Using the Coq extraction mechanism, it is fully executable and used by the Verasco C static analyzer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{McDonell:2016:GTS, author = "Trevor L. McDonell and Timothy A. K. Zakian and Matteo Cimini and Ryan R. Newton", title = "Ghostbuster: a tool for simplifying and converting {GADTs}", journal = j-SIGPLAN, volume = "51", number = "9", pages = "338--350", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951914", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Generalized Algebraic Datatypes, or simply GADTs, can encode non-trivial properties in the types of the constructors. Once such properties are encoded in a datatype, however, all code manipulating that datatype must provide proof that it maintains these properties in order to typecheck. In this paper, we take a step towards gradualizing these obligations. We introduce a tool, Ghostbuster, that produces simplified versions of GADTs which elide selected type parameters, thereby weakening the guarantees of the simplified datatype in exchange for reducing the obligations necessary to manipulate it. Like ornaments, these simplified datatypes preserve the recursive structure of the original, but unlike ornaments we focus on information-preserving bidirectional transformations. Ghostbuster generates type-safe conversion functions between the original and simplified datatypes, which we prove are the identity function when composed. 
We evaluate a prototype tool for Haskell against thousands of GADTs found on the Hackage package database, generating simpler Haskell'98 datatypes and round-trip conversion functions between the two.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Thibodeau:2016:ICT, author = "David Thibodeau and Andrew Cave and Brigitte Pientka", title = "Indexed codata types", journal = j-SIGPLAN, volume = "51", number = "9", pages = "351--363", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951929", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Indexed data types allow us to specify and verify many interesting invariants about finite data in a general-purpose programming language. In this paper we investigate the dual idea: indexed codata types, which allow us to describe data-dependencies about infinite data structures. Unlike finite data which is defined by constructors, we define infinite data by observations. Dual to pattern matching on indexed data which may refine the type indices, we define copattern matching on indexed codata where type indices guard observations we can make. Our key technical contributions are three-fold: first, we extend Levy's call-by-push-value language with support for indexed (co)data and deep (co)pattern matching; second, we provide a clean foundation for dependent (co)pattern matching using equality constraints; third, we describe a small-step semantics using a continuation-based abstract machine, define coverage for indexed (co)patterns, and prove type safety. This is an important step towards building a foundation where (co)data type definitions and dependent types can coexist.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Oliveira:2016:DIT, author = "Bruno C. d. S. Oliveira and Zhiyuan Shi and Jo{\~a}o Alpuim", title = "Disjoint intersection types", journal = j-SIGPLAN, volume = "51", number = "9", pages = "364--377", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951945", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dunfield showed that a simply typed core calculus with intersection types and a merge operator is able to capture various programming language features. While his calculus is type-safe, it is not coherent: different derivations for the same expression can elaborate to expressions that evaluate to different values. The lack of coherence is an important disadvantage for adoption of his core calculus in implementations of programming languages, as the semantics of the programming language becomes implementation-dependent. This paper presents \lambda _i: a coherent and type-safe calculus with a form of intersection types and a merge operator. Coherence is achieved by ensuring that intersection types are disjoint and programs are sufficiently annotated to avoid type ambiguity.
We propose a definition of disjointness where two types A and B are disjoint only if a certain set of types are common supertypes of A and B. We investigate three different variants of \lambda _i, with three variants of disjointness. In the simplest variant, which does not allow \top types, two types are disjoint if they do not share any common supertypes at all. The other two variants introduce \top types and refine the notion of disjointness to allow two types to be disjoint when their only common supertypes are top-like. The difference between the two variants with \top types is in the definition of top-like types, which has an impact on which types are allowed in intersections. We present a type system that prevents intersection types that are not disjoint, as well as algorithmic specifications to determine whether two types are disjoint for all three variants.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Castagna:2016:STT, author = "Giuseppe Castagna and Tommaso Petrucciani and Kim Nguy{\~{\^e}}n", title = "Set-theoretic types for polymorphic variants", journal = j-SIGPLAN, volume = "51", number = "9", pages = "378--391", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951928", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Polymorphic variants are a useful feature of the OCaml language whose current definition and implementation rely on kinding constraints to simulate a subtyping relation via unification. This yields an awkward formalization and results in a type system whose behaviour is in some cases unintuitive and/or unduly restrictive. In this work, we present an alternative formalization of polymorphic variants, based on set-theoretic types and subtyping, that yields a cleaner and more streamlined system. Our formalization is more expressive than the current one (it types more programs while preserving type safety), it can internalize some meta-theoretic properties, and it removes some pathological cases of the current implementation resulting in a more intuitive and, thus, predictable type system. More generally, this work shows how to add full-fledged union types to functional languages of the ML family that usually rely on the Hindley-Milner type system. As an aside, our system also improves the theory of semantic subtyping, notably by proving completeness for the type reconstruction algorithm.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Raghunathan:2016:HMM, author = "Ram Raghunathan and Stefan K. Muller and Umut A. Acar and Guy Blelloch", title = "Hierarchical memory management for parallel programs", journal = j-SIGPLAN, volume = "51", number = "9", pages = "392--406", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951935", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "An important feature of functional programs is that they are parallel by default.
Implementing an efficient parallel functional language, however, is a major challenge, in part because the high rate of allocation and freeing associated with functional programs requires an efficient and scalable memory manager. In this paper, we present a technique for parallel memory management for strict functional languages with nested parallelism. At the highest level of abstraction, the approach consists of a technique to organize memory as a hierarchy of heaps, and an algorithm for performing automatic memory reclamation by taking advantage of a disentanglement property of parallel functional programs. More specifically, the idea is to assign to each parallel task its own heap in memory and organize the heaps in a hierarchy/tree that mirrors the hierarchy of tasks. We present a nested-parallel calculus that specifies hierarchical heaps and prove in this calculus a disentanglement property, which prohibits a task from accessing objects allocated by another task that might execute in parallel. Leveraging the disentanglement property, we present a garbage collection technique that can operate on any subtree in the memory hierarchy concurrently as other tasks (and/or other collections) proceed in parallel. We prove the safety of this collector by formalizing it in the context of our parallel calculus. In addition, we describe how the proposed techniques can be implemented on modern shared-memory machines and present a prototype implementation as an extension to MLton, a high-performance compiler for the Standard ML language. Finally, we evaluate the performance of this implementation on a number of parallel benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Gilray:2016:ACP, author = "Thomas Gilray and Michael D. Adams and Matthew Might", title = "Allocation characterizes polyvariance: a unified methodology for polyvariant control-flow analysis", journal = j-SIGPLAN, volume = "51", number = "9", pages = "407--420", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951936", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The polyvariance of a static analysis is the degree to which it structurally differentiates approximations of program values. Polyvariant techniques come in a number of different flavors that represent alternative heuristics for managing the trade-off an analysis strikes between precision and complexity. For example, call sensitivity supposes that values will tend to correlate with recent call sites, object sensitivity supposes that values will correlate with the allocation points of related objects, the Cartesian product algorithm supposes correlations between the values of arguments to the same function, and so forth. In this paper, we describe a unified methodology for implementing and understanding polyvariance in a higher-order setting (i.e., for control-flow analyses). We do this by extending the method of abstracting abstract machines (AAM), a systematic approach to producing an abstract interpretation of abstract-machine semantics. 
AAM eliminates recursion within a language's semantics by passing around an explicit store, and thus places importance on the strategy an analysis uses for allocating abstract addresses within the abstract heap or store. We build on AAM by showing that the design space of possible abstract allocators exactly and uniquely corresponds to the design space of polyvariant strategies. This allows us to both unify and generalize polyvariance as tunings of a single function. Changes to the behavior of this function easily recapitulate classic styles of analysis and produce novel variations, combinations of techniques, and fundamentally new techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Ueno:2016:FCG, author = "Katsuhiro Ueno and Atsushi Ohori", title = "A fully concurrent garbage collector for functional programs on multicore processors", journal = j-SIGPLAN, volume = "51", number = "9", pages = "421--433", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951944", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a concurrent garbage collection method for functional programs running on a multicore processor. It is a concurrent extension of our bitmap-marking non-moving collector with Yuasa's snapshot-at-the-beginning strategy. Our collector is unobtrusive in the sense of the Doligez-Leroy-Gonthier collector; the collector does not stop any mutator thread nor does it force them to synchronize globally. The only critical sections between a mutator and the collector are the code to enqueue/dequeue a 32 kB allocation segment to/from a global segment list and the write barrier code to push an object pointer onto the collector's stack. Most of these data structures can be implemented in standard lock-free data structures. This achieves both efficient allocation and unobtrusive collection in a multicore system. The proposed method has been implemented in SML\#, a full-scale Standard ML compiler supporting multiple native threads on multicore CPUs. Our benchmark tests show a drastically short pause time with reasonably low overhead compared to the sequential bitmap-marking collector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Lindley:2016:TBS, author = "Sam Lindley and J. Garrett Morris", title = "Talking bananas: structural recursion for session types", journal = j-SIGPLAN, volume = "51", number = "9", pages = "434--447", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951921", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Session types provide static guarantees that concurrent programs respect communication protocols. We give a novel account of recursive session types in the context of GV, a small concurrent extension of the linear \lambda -calculus. We extend GV with recursive types and catamorphisms, following the initial algebra semantics of recursion, and show that doing so naturally gives rise to recursive session types. 
We show that this principled approach to recursion resolves long-standing problems in the treatment of duality for recursive session types. We characterize the expressiveness of GV concurrency by giving a CPS translation to (non-concurrent) \lambda -calculus and proving that reduction in GV is simulated by full reduction in \lambda -calculus. This shows that GV remains terminating in the presence of positive recursive types, and that such arguments extend to other extensions of GV, such as polymorphism or non-linear types, by appeal to normalization results for sequential \lambda -calculi. We also show that GV remains deadlock free and deterministic in the presence of recursive types. Finally, we extend CP, a session-typed process calculus based on linear logic, with recursive types, and show that doing so preserves the connection between reduction in GV and cut elimination in CP.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Morris:2016:BBW, author = "J. Garrett Morris", title = "The best of both worlds: linear functional programming without compromise", journal = j-SIGPLAN, volume = "51", number = "9", pages = "448--461", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951925", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a linear functional calculus with both the safety guarantees expressible with linear types and the rich language of combinators and composition provided by functional programming. Unlike previous combinations of linear typing and functional programming, we compromise neither the linear side (for example, our linear values are first-class citizens of the language) nor the functional side (for example, we do not require duplicate definitions of compositions for linear and unrestricted functions). To do so, we must generalize abstraction and application to encompass both linear and unrestricted functions. We capture the typing of the generalized constructs with a novel use of qualified types. Our system maintains the metatheoretic properties of the theory of qualified types, including principal types and decidable type inference. Finally, we give a formal basis for our claims of expressiveness, by showing that evaluation respects linearity, and that our language is a conservative extension of existing functional calculi.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Thiemann:2016:CFS, author = "Peter Thiemann and Vasco T. Vasconcelos", title = "Context-free session types", journal = j-SIGPLAN, volume = "51", number = "9", pages = "462--475", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951926", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Session types describe structured communication on heterogeneously typed channels at a high level. Their tail-recursive structure imposes a protocol that can be described by a regular language. 
The types of transmitted values are drawn from the underlying functional language, abstracting from the details of serializing values of structured data types. Context-free session types extend session types by allowing nested protocols that are not restricted to tail recursion. Nested protocols correspond to deterministic context-free languages. Such protocols are interesting in their own right, but they are particularly suited to describe the low-level serialization of tree-structured data in a type-safe way. We establish the metatheory of context-free session types, prove that they properly generalize standard (two-party) session types, and take first steps towards type checking by showing that type equivalence is decidable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Gaboardi:2016:CEC, author = "Marco Gaboardi and Shin-ya Katsumata and Dominic Orchard and Flavien Breuvart and Tarmo Uustalu", title = "Combining effects and coeffects via grading", journal = j-SIGPLAN, volume = "51", number = "9", pages = "476--489", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951939", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Effects and coeffects are two general, complementary aspects of program behaviour. They roughly correspond to computations which change the execution context (effects) versus computations which make demands on the context (coeffects). Effectful features include partiality, non-determinism, input-output, state, and exceptions. Coeffectful features include resource demands, variable access, notions of linearity, and data input requirements. The effectful or coeffectful behaviour of a program can be captured and described via type-based analyses, with fine grained information provided by monoidal effect annotations and semiring coeffects. Various recent work has proposed models for such typed calculi in terms of graded (strong) monads for effects and graded (monoidal) comonads for coeffects. Effects and coeffects have been studied separately so far, but in practice many computations are both effectful and coeffectful, e.g., possibly throwing exceptions but with resource requirements. To remedy this, we introduce a new general calculus with a combined effect-coeffect system. This can describe both the changes and requirements that a program has on its context, as well as interactions between these effectful and coeffectful features of computation. The effect-coeffect system has a denotational model in terms of effect-graded monads and coeffect-graded comonads where interaction is expressed via the novel concept of graded distributive laws. This graded semantics unifies the syntactic type theory with the denotational model. 
We show that our calculus can be instantiated to describe in a natural way various different kinds of interaction between a program and its evaluation context.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Pirog:2016:SDF, author = "Maciej Pir{\'o}g and Nicolas Wu", title = "String diagrams for free monads (functional pearl)", journal = j-SIGPLAN, volume = "51", number = "9", pages = "490--501", month = sep, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022670.2951947", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We show how one can reason about free monads using their universal properties rather than any concrete implementation. We introduce a graphical, two-dimensional calculus tailor-made to accommodate these properties.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ICFP '16 conference proceedings.", } @Article{Wade:2017:AVJ, author = "April W. Wade and Prasad A. Kulkarni and Michael R. Jantz", title = "{AOT} vs. {JIT}: impact of profile data on code quality", journal = j-SIGPLAN, volume = "52", number = "4", pages = "1--10", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081037", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Just-in-time (JIT) compilation during program execution and ahead-of-time (AOT) compilation during software installation are alternate techniques used by managed language virtual machines (VM) to generate optimized native code while simultaneously achieving binary code portability and high execution performance. Profile data collected by JIT compilers at run-time can enable profile-guided optimizations (PGO) to customize the generated native code to different program inputs. AOT compilation removes the speed and energy overhead of online profile collection and dynamic compilation, but may not be able to achieve the quality and performance of customized native code. The goal of this work is to investigate and quantify the implications of the AOT compilation model on the quality of the generated native code for current VMs. First, we quantify the quality of native code generated by the two compilation models for a state-of-the-art (HotSpot) Java VM. Second, we determine how the amount of profile data collected affects the quality of generated code. Third, we develop a mechanism to determine the accuracy or similarity for different profile data for a given program run, and investigate how the accuracy of profile data affects its ability to effectively guide PGOs. 
Finally, we categorize the profile data types in our VM and explore the contribution of each such category to performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Taylor:2017:AOO, author = "Ben Taylor and Vicent Sanz Marco and Zheng Wang", title = "Adaptive optimization for {OpenCL} programs on embedded heterogeneous systems", journal = j-SIGPLAN, volume = "52", number = "4", pages = "11--20", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081040", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Heterogeneous multi-core architectures consisting of CPUs and GPUs are commonplace in today's embedded systems. These architectures offer potential for energy-efficient computing if the application task is mapped to the right core. Realizing such potential is challenging due to the complex and evolving nature of hardware and applications. This paper presents an automatic approach to map OpenCL kernels onto heterogeneous multi-cores for a given optimization criterion --- whether it is faster runtime, lower energy consumption or a trade-off between them. This is achieved by developing a machine learning based approach to predict which processor to use to run the OpenCL kernel and the host program, and at what frequency the processor should operate. Instead of hand-tuning a model for each optimization metric, we use machine learning to develop a unified framework that first automatically learns the optimization heuristic for each metric off-line, then uses the learned knowledge to schedule OpenCL kernels at runtime based on code and runtime information of the program. We apply our approach to a set of representative OpenCL benchmarks and evaluate it on an ARM big.LITTLE mobile platform. Our approach achieves over 93\% of the performance delivered by a perfect predictor. We obtain, on average, 1.2x, 1.6x, and 1.8x improvements, respectively, for runtime, energy consumption, and the energy delay product when compared to a comparative heterogeneous-aware OpenCL task mapping scheme.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Reiche:2017:AVI, author = "Oliver Reiche and Christof Kobylko and Frank Hannig and J{\"u}rgen Teich", title = "Auto-vectorization for image processing {DSLs}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "21--30", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081039", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parallelizing programs and distributing their workloads to multiple threads can be a challenging task. In addition to multi-threading, harnessing vector units in CPUs proves highly desirable. However, employing vector units to speed up programs can be quite tedious. Either a program developer solely relies on the auto-vectorization capabilities of the compiler or he manually applies vector intrinsics, which is extremely error-prone, difficult to maintain, and not portable at all.
Based on whole-function vectorization, a method to replace control flow with data flow, we propose auto-vectorization techniques for image processing DSLs in the context of source-to-source compilation. The approach does not require the input to be available in SSA form. Moreover, we formulate constraints under which the vectorization analysis and code transformations may be greatly simplified in the context of image processing DSLs. As part of our methodology, we present control flow to data flow transformation as a source-to-source translation. Moreover, we propose a method to efficiently analyze algorithms with mixed bit-width data types to determine the optimal SIMD width, independently of the target instruction set. The techniques are integrated into an open source DSL framework. Subsequently, the vectorization capabilities are compared to a variety of existing state-of-the-art C/C++ compilers. A geometric mean speedup of up to 3.14 is observed for benchmarks taken from ISPC and image processing, compared to non-vectorized executions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Fu:2017:DTS, author = "Sheng-Yu Fu and Ding-Yong Hong and Yu-Ping Liu and Jan-Jan Wu and Wei-Chung Hsu", title = "Dynamic translation of structured Loads\slash Stores and register mapping for architectures with {SIMD} extensions", journal = j-SIGPLAN, volume = "52", number = "4", pages = "31--40", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081029", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "More and more modern processors have been supporting non-contiguous SIMD data accesses. However, translating such instructions has been overlooked in the Dynamic Binary Translation (DBT) area. For example, in the popular QEMU dynamic binary translator, guest memory instructions with strides are emulated by a sequence of scalar instructions, leaving significant room for performance improvement when the host machines have SIMD instructions available. Structured loads/stores, such as VLDn/VSTn in ARM NEON, are one type of strided SIMD data access instructions. They are widely used in signal processing, multimedia, mathematical and 2D matrix transposition applications. Efficient translation of such structured loads/stores is a critical issue when migrating ARM executables to other ISAs. However, it is quite challenging since not only is the translation of structured loads/stores non-trivial, but the difference between guest and host register configurations must also be taken into consideration. In this work, we present the design and implementation of translating structured loads/stores in DBT, including target code generation as well as efficient SIMD register mapping. Our proposed register mapping mechanisms are not limited to handling structured loads/stores; they can be extended to deal with normal SIMD instructions. On a set of OpenCV benchmarks, our QEMU-based system has achieved a maximum speedup of 5.41x, with an average improvement of 2.93x.
On a set of BLAS benchmarks, our system has also obtained a maximum speedup of 2.19x and an average improvement of 1.63x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Jiang:2017:OFU, author = "Weiwen Jiang and Edwin H.-M. Sha and Qingfeng Zhuge and Hailiang Dong and Xianzhang Chen", title = "Optimal functional unit assignment and voltage selection for pipelined {MPSoC} with guaranteed probability on time performance", journal = j-SIGPLAN, volume = "52", number = "4", pages = "41--50", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081036", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Pipelined heterogeneous multiprocessor system-on-chip (MPSoC) can provide high throughput for streaming applications. In the design of such systems, time performance and system cost are the most concerning issues. By analyzing runtime behaviors of benchmarks in real-world platforms, we find that execution times of tasks are not fixed but spread with probabilities. In terms of this feature, we model execution times of tasks as random variables. In this paper, we study how to design high-performance and low-cost MPSoC systems to execute a set of such tasks with data dependencies in a pipelined fashion. Our objective is to obtain the optimal functional unit assignment and voltage selection for the pipelined MPSoC systems, such that the system cost is minimized while timing constraints can be met with a given guaranteed probability. For each required probability, our proposed algorithm can efficiently obtain the optimal solution. Experiments show that other existing algorithms cannot find feasible solutions in most cases, but ours can. Even for those solutions that other algorithms can obtain, ours can reach 30\% reductions in total cost compared with others.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Lee:2017:IIP, author = "Gyeongmin Lee and Seonyeong Heo and Bongjun Kim and Jong Kim and Hanjun Kim", title = "Integrated {IoT} programming with selective abstraction", journal = j-SIGPLAN, volume = "52", number = "4", pages = "51--60", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081031", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The explosion of networked devices has driven a new computing environment called the Internet of Things (IoT), enabling various services such as home automation and health monitoring. Despite the promising applicability of the IoT, developing an IoT service is challenging for programmers, because the programmers should integrate multiple programmable devices and heterogeneous third-party devices. Recent works have proposed integrated programming platforms, but they either require device-specific implementation for third-party devices without any device abstraction, or abstract all the devices to the standard interfaces requiring unnecessary abstraction of programmable devices. 
To integrate IoT devices with selective abstraction, this work revisits the object-oriented programming (OOP) model, and proposes a new language extension and its compiler-runtime framework, called Esperanto. With three annotations that map each object to its corresponding IoT device, the Esperanto language allows programmers to integrate multiple programmable devices into one OOP program and to abstract similar third-party devices into their common ancestor classes. Given the annotations, the Esperanto compiler automatically partitions the integrated program into multiple sub-programs for each programmable IoT device, and inserts communication and synchronization code. Moreover, for the ancestor classes, the Esperanto runtime dynamically identifies connected third-party devices, and links their corresponding descendant objects. Compared to an existing approach to integrated IoT programming, Esperanto requires 33.3\% fewer lines of code to implement 5 IoT services, and reduces their response time by 44.8\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Zhang:2017:TSB, author = "Min Zhang and Yunhui Ying", title = "Towards {SMT-based} {LTL} model checking of clock constraint specification language for real-time and embedded systems", journal = j-SIGPLAN, volume = "52", number = "4", pages = "61--70", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081035", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Clock Constraint Specification Language (CCSL) is a formal language companion to MARTE (shorthand for Modeling and Analysis of Real-Time and Embedded systems), a UML profile used to facilitate the design and analysis of real-time and embedded systems. CCSL is proposed to specify constraints on the occurrences of events in systems. However, the language lacks efficient verification support to formally analyze temporal properties, which are important properties for real-time and embedded systems. In this paper, we propose an SMT-based approach to model checking of the temporal properties specified in Linear Temporal Logic (LTL) for CCSL by transforming CCSL constraints and LTL formulas into SMT formulas. We implement a prototype tool for the proposed approach and use the state-of-the-art tool Z3 as its underlying SMT solver. We model two practical real-time and embedded systems, i.e., a traffic light controller and a power window system, in CCSL, and model check LTL properties of them using the proposed approach.
Experimental results demonstrate the effectiveness and efficiency of our approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Zheng:2017:ITS, author = "Wenguang Zheng and Hui Wu and Chuanyao Nie", title = "Integrating task scheduling and cache locking for multicore real-time embedded systems", journal = j-SIGPLAN, volume = "52", number = "4", pages = "71--80", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081033", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern embedded processors provide hardware support for cache locking, a mechanism used to facilitate the WCET (Worst-Case Execution Time) calculation of a task. We investigate the problem of integrating task scheduling and cache locking for a set of preemptible tasks with individual release times and deadlines on a multi-core processor with two-level caches. We propose a novel integrated approach that schedules the task set and allocates the locked cache contents of each task to the local caches (L1 caches) and the level-two cache (L2 cache). Our approach consists of three major components: the task scheduler, the L1 cache allocator, and the L2 cache allocator. The task scheduler aims at minimizing the number of task preemptions. The L1 cache allocator converts the interference graph of all the tasks scheduled on each core into a DAG by considering the preemptions between tasks and allocates the L1 cache space to each task. The L2 cache allocator converts the interference graph of all the tasks into a DAG by using a k-longest-path-based graph orientation algorithm and allocates the L2 cache space to each task. Both cache allocators significantly improve the cache utilization for all the caches due to the efficient use of the interference graphs of tasks. We have implemented our approach and compared it with the extended version of the preemption tree-based approach and the static analysis approach without cache locking by using a set of benchmarks from the MRTC WCET benchmark suite and SNU real-time benchmarks. Compared to the extended version of the preemption tree-based approach, the maximum WCRT (Worst Case Response Time) improvement of our approach is 15\%. Compared to the static analysis approach, the maximum WCRT improvement of our approach is 37\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Wang:2017:TME, author = "Yi Wang and Mingxu Zhang and Jing Yang", title = "Towards memory-efficient processing-in-memory architecture for convolutional neural networks", journal = j-SIGPLAN, volume = "52", number = "4", pages = "81--90", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081032", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Convolutional neural networks (CNNs) are widely adopted in artificial intelligence systems.
In contrast to conventional computing-centric applications, the computational and memory resources of CNN applications are mixed together in the network weights. This incurs a significant amount of data movement, especially for high-dimensional convolutions. Although recent embedded 3D-stacked Processing-in-Memory (PIM) architectures alleviate this memory bottleneck to provide fast near-data processing, memory is still a limiting factor of the entire system. An unsolved key challenge is how to efficiently allocate convolutions to 3D-stacked PIM to combine the advantages of both neural and computational processing. This paper presents Memolution, a compiler-based memory-efficient data allocation strategy for convolutional neural networks on PIM architecture. Memolution offers thread-level parallelism that can fully exploit the computational power of PIM architecture. The objective is to capture the characteristics of neural network applications and present a hardware-independent design to transparently allocate CNN applications onto the underlying hardware resources provided by PIM. We demonstrate the viability of the proposed technique using a variety of realistic convolutional neural network applications. Our extensive evaluations show that Memolution significantly improves performance and cache utilization compared to the baseline scheme.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Ding:2017:UNS, author = "Xianzhong Ding and Zhiyong Zhang and Zhiping Jia and Lei Ju and Mengying Zhao and Huawei Huang", title = "Unified {nvTCAM} and {sTCAM} architecture for improving packet matching performance", journal = j-SIGPLAN, volume = "52", number = "4", pages = "91--100", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081034", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software-Defined Networking (SDN) allows controlling applications to install fine-grained forwarding policies in the underlying switches. Ternary Content Addressable Memory (TCAM) enables fast lookups in hardware switches with flexible wildcard rule patterns. However, the performance of packet processing is severely constrained by the capacity of TCAM, which aggravates the processing burden and latency issues. In this paper, we propose a hybrid TCAM architecture which consists of NVM-based TCAM (nvTCAM) and SRAM-based TCAM (sTCAM), utilizing nvTCAM to cache the most popular rules to improve the cache hit ratio while relying on a very small-size sTCAM to handle cache-miss traffic to effectively decrease update latency. Considering the special rule dependency, we present an efficient Rule Migration Replacement (RMR) policy to make full use of both nvTCAM and sTCAM to obtain better performance.
Experimental results show that the proposed architecture outperforms current TCAM architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Pan:2017:LPM, author = "Chen Pan and Mimi Xie and Yongpan Liu and Yanzhi Wang and Chun Jason Xue and Yuangang Wang and Yiran Chen and Jingtong Hu", title = "A lightweight progress maximization scheduler for non-volatile processor under unstable energy harvesting", journal = j-SIGPLAN, volume = "52", number = "4", pages = "101--110", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081038", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy harvesting techniques have become increasingly popular as power supplies for embedded systems. However, the harvested energy is intrinsically unstable. Thus, the program execution may be interrupted frequently. Although the development of non-volatile processors (NVP) can save and restore execution states, both hardware and software challenges exist for energy harvesting powered embedded systems. On the hardware side, existing power detectors only signal the ``poor'' quality of the harvested power based on a preset threshold voltage. The inappropriate setting of this threshold will make the NVP-based embedded system suffer from either unnecessary checkpointing or checkpointing failures. On the software side, not all tasks can be checkpointed. Once the power is off, these tasks will have to restart from the beginning. In this paper, a task scheduler is proposed to maximize task progress by prioritizing tasks which cannot be checkpointed when power is weak so that they can finish before the power outage. To assist task scheduling, three additional modules, including a voltage monitor, a checkpointing handler, and a routine handler, are proposed. Experimental results show increased overall task progress and reduced energy consumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Dietrich:2017:OVA, author = "Christian Dietrich and Daniel Lohmann", title = "{OSEK-V}: application-specific {RTOS} instantiation in hardware", journal = j-SIGPLAN, volume = "52", number = "4", pages = "111--120", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081030", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The employment of a real-time operating system (RTOS) in an embedded control system is often an all-or-nothing decision: While the RTOS abstractions provide for easier software composition and development, the price in terms of event latencies and memory costs is high. Especially in HW/SW codesign settings, system developers try to avoid the employment of a full-blown RTOS as far as possible. In OSEK-V, we mitigate this trade-off by a very aggressive tailoring of the concrete RTOS instance into the hardware.
Instead of implementing generic OS components as custom hardware devices, we capture the actually possible application-kernel interactions as a finite-state machine and integrate the tailored RTOS semantics directly into the processor pipeline. In our experimental results with an OSEK-based implementation of a quadrotor flight controller into the Rocket/RISC-V softcore, we can thereby significantly reduce event latencies, interrupt lock times, and memory footprint at moderate costs in terms of FPGA resources.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '17 conference proceedings.", } @Article{Pai:2016:CTO, author = "Sreepathi Pai and Keshav Pingali", title = "A compiler for throughput optimization of graph algorithms on {GPUs}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "1--19", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984015", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Writing high-performance GPU implementations of graph algorithms can be challenging. In this paper, we argue that three optimizations called throughput optimizations are key to high performance for this application class. These optimizations describe a large implementation space, making it unrealistic for programmers to implement them by hand. To address this problem, we have implemented these optimizations in a compiler that produces CUDA code from an intermediate-level program representation called IrGL. Compared to state-of-the-art handwritten CUDA implementations of eight graph applications, code generated by the IrGL compiler is up to 5.95x faster (median 1.4x) for five applications and never more than 30\% slower for the others. Throughput optimizations contribute an improvement of up to 4.16x (median 1.4x) to the performance of unoptimized IrGL code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Surendran:2016:APP, author = "Rishi Surendran and Vivek Sarkar", title = "Automatic parallelization of pure method calls via conditional future synthesis", journal = j-SIGPLAN, volume = "51", number = "10", pages = "20--38", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984035", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a novel approach for using futures to automatically parallelize the execution of pure method calls. Our approach is built on three new techniques to address the challenge of automatic parallelization via future synthesis: candidate future synthesis, parallelism benefit analysis, and threshold expression synthesis. During candidate future synthesis, our system annotates pure method calls as async expressions and synthesizes a parallel program with future objects and their type declarations. Next, the system performs a parallel benefit analysis to determine which async expressions may need to be executed sequentially due to overhead reasons, based on execution profile information collected from multiple test inputs.
Finally, threshold expression synthesis uses the output from parallelism benefit analysis to synthesize predicate expressions that can be used to determine at runtime if a specific pure method call should be executed sequentially or in parallel. We have implemented our approach, and the results obtained from an experimental evaluation of the complete system on a range of sequential Java benchmarks are very encouraging. Our evaluation shows that our approach can provide significant parallel speedups of up to 7.4 $ \times $ (geometric mean of 3.69 $ \times $) relative to the sequential programs when using 8 processor cores, with zero programmer effort beyond providing the sequential program and test cases for parallelism benefit analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Sorensen:2016:PIW, author = "Tyler Sorensen and Alastair F. Donaldson and Mark Batty and Ganesh Gopalakrishnan and Zvonimir Rakamari{\'c}", title = "Portable inter-workgroup barrier synchronisation for {GPUs}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "39--58", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984032", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite the growing popularity of GPGPU programming, there is not yet a portable and formally-specified barrier that one can use to synchronise across workgroups. Moreover, the occupancy-bound execution model of GPUs breaks assumptions inherent in traditional software execution barriers, exposing them to deadlock. We present an occupancy discovery protocol that dynamically discovers a safe estimate of the occupancy for a given GPU and kernel, allowing for a starvation-free (and hence, deadlock-free) inter-workgroup barrier by restricting the number of workgroups according to this estimate. We implement this idea by adapting an existing, previously non-portable, GPU inter-workgroup barrier to use OpenCL 2.0 atomic operations, and prove that the barrier meets its natural specification in terms of synchronisation. We assess the portability of our approach over eight GPUs spanning four vendors, comparing the performance of our method against alternative methods. 
Our key findings include: (1) the recall of our discovery protocol is nearly 100\%; (2) runtime comparisons vary substantially across GPUs and applications; and (3) our method provides portable and safe inter-workgroup synchronisation across the applications we study.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Doeraene:2016:PIW, author = "S{\'e}bastien Doeraene and Tobias Schlatter", title = "Parallel incremental whole-program optimizations for {Scala.js}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "59--73", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984013", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Whole-program optimizations are powerful tools that can dramatically improve performance, size and other aspects of programs. Because they depend on global knowledge, they must typically be reapplied to the whole program when small changes are made, which makes them too slow for the development cycle. This is an issue for some environments that require, or benefit a lot from, whole-program optimizations, such as compilation to JavaScript or to the Dalvik VM, because their development cycle is slowed down either by the lack of optimizations, or by the time spent on applying them. We present a new approach to designing incremental whole-program optimizers for object-oriented and functional languages: when part of a program changes, only the portions affected by the changes are reoptimized. An incremental optimizer using this approach for Scala.js, the Scala to JavaScript compiler, demonstrates speedups from 10x to 100x compared to its batch version. As a result, the optimizer's running time becomes insignificant compared to separate compilation, making it fit for use on every compilation run during the development cycle. We also show how to parallelize the incremental algorithm to take advantage of multicore hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Stefanescu:2016:SBP, author = "Andrei Stefanescu and Daejun Park and Shijiao Yuwen and Yilong Li and Grigore Rosu", title = "Semantics-based program verifiers for all languages", journal = j-SIGPLAN, volume = "51", number = "10", pages = "74--91", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984027", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a language-independent verification framework that can be instantiated with an operational semantics to automatically generate a program verifier. The framework treats both the operational semantics and the program correctness specifications as reachability rules between matching logic patterns, and uses the sound and relatively complete reachability logic proof system to prove the specifications using the semantics.
We instantiate the framework with the semantics of one academic language, KernelC, as well as with three recent semantics of real-world languages, C, Java, and JavaScript, developed independently of our verification infrastructure. We evaluate our approach empirically and show that the generated program verifiers can check automatically the full functional correctness of challenging heap-manipulating programs implementing operations on list and tree data structures, like AVL trees. This is the first approach that can turn the operational semantics of real-world languages into correct-by-construction automatic verifiers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Sergey:2016:HSS, author = "Ilya Sergey and Aleksandar Nanevski and Anindya Banerjee and Germ{\'a}n Andr{\'e}s Delbianco", title = "{Hoare}-style specifications as correctness conditions for non-linearizable concurrent objects", journal = j-SIGPLAN, volume = "51", number = "10", pages = "92--110", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983999", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Designing efficient concurrent objects often requires abandoning the standard specification technique of linearizability in favor of more relaxed correctness conditions. However, the variety of alternatives makes it difficult to choose which condition to employ, and how to compose them when using objects specified by different conditions. In this work, we propose a uniform alternative in the form of Hoare logic, which can explicitly capture--in the auxiliary state--the interference of environment threads. We demonstrate the expressiveness of our method by verifying a number of concurrent objects and their clients, which have so far been specified only by non-standard conditions of concurrency-aware linearizability, quiescent, and quantitative quiescent consistency. We report on the implementation of the ideas in an existing Coq-based tool, providing the first mechanized proofs for all the examples in the paper.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Nienhuis:2016:OSC, author = "Kyndylan Nienhuis and Kayvan Memarian and Peter Sewell", title = "An operational semantics for {C\slash C++11} concurrency", journal = j-SIGPLAN, volume = "51", number = "10", pages = "111--128", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983997", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The C/C++11 concurrency model balances two goals: it is relaxed enough to be efficiently implementable and (leaving aside the ``thin-air'' problem) it is strong enough to give useful guarantees to programmers. It is mathematically precise and has been used in verification research and compiler testing. However, the model is expressed in an axiomatic style, as predicates on complete candidate executions. 
This suffices for computing the set of allowed executions of a small litmus test, but it does not directly support the incremental construction of executions of larger programs. It is also at odds with conventional operational semantics, as used implicitly in the rest of the C/C++ standards. Our main contribution is the development of an operational model for C/C++11 concurrency. This covers all the features of the previous formalised axiomatic model, and we have a mechanised proof that the two are equivalent, in Isabelle/HOL. We also integrate this semantics with an operational semantics for sequential C (described elsewhere); the combined semantics can incrementally execute programs in a small fragment of C. Doing this uncovered several new aspects of the C/C++11 model: we show that one cannot build an equivalent operational model that simply follows program order, sequential consistent order, or the synchronises-with order. The first negative result is forced by hardware-observable behaviour, but the latter two are not, and so might be ameliorated by changing C/C++11. More generally, we hope that this work, with its focus on incremental construction of executions, will inform the future design of new concurrency models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Dan:2016:MAR, author = "Andrei Marian Dan and Patrick Lam and Torsten Hoefler and Martin Vechev", title = "Modeling and analysis of remote memory access programming", journal = j-SIGPLAN, volume = "51", number = "10", pages = "129--144", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984033", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent advances in networking hardware have led to a new generation of Remote Memory Access (RMA) networks in which processors from different machines can communicate directly, bypassing the operating system and allowing higher performance. Researchers and practitioners have proposed libraries and programming models for RMA to enable the development of applications running on these networks, However, the memory models implied by these RMA libraries and languages are often loosely specified, poorly understood, and differ depending on the underlying network architecture and other factors. Hence, it is difficult to precisely reason about the semantics of RMA programs or how changes in the network architecture affect them. We address this problem with the following contributions: (i) a coreRMA language which serves as a common foundation, formalizing the essential characteristics of RMA programming; (ii) complete axiomatic semantics for that language; (iii) integration of our semantics with an existing constraint solver, enabling us to exhaustively generate coreRMA programs (litmus tests) up to a specified bound and check whether the tests satisfy their specification; and (iv) extensive validation of our semantics on real-world RMA systems. We generated and ran 7441 litmus tests using each of the low-level RMA network APIs: DMAPP, VPI Verbs, and Portals 4. Our results confirmed that our model successfully captures behaviors exhibited by these networks. Moreover, we found RMA programs that behave inconsistently with existing documentation, confirmed by network experts. 
Our work provides an important step towards understanding existing RMA networks, thus influencing the design of future RMA interfaces and hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Itzhaky:2016:DDC, author = "Shachar Itzhaky and Rohit Singh and Armando Solar-Lezama and Kuat Yessenov and Yongquan Lu and Charles Leiserson and Rezaul Chowdhury", title = "Deriving divide-and-conquer dynamic programming algorithms using solver-aided transformations", journal = j-SIGPLAN, volume = "51", number = "10", pages = "145--164", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983993", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a framework allowing domain experts to manipulate computational terms in the interest of deriving better, more efficient implementations. It employs deductive reasoning to generate provably correct efficient implementations from a very high-level specification of an algorithm, and inductive constraint-based synthesis to improve automation. Semantic information is encoded into program terms through the use of refinement types. In this paper, we develop the technique in the context of a system called Bellmania that uses solver-aided tactics to derive parallel divide-and-conquer implementations of dynamic programming algorithms that have better locality and are significantly more efficient than traditional loop-based implementations. Bellmania includes a high-level language for specifying dynamic programming algorithms and a calculus that facilitates gradual transformation of these specifications into efficient implementations. These transformations formalize the divide-and-conquer technique; a visualization interface helps users to interactively guide the process, while an SMT-based back-end verifies each step and takes care of low-level reasoning required for parallelism. We have used the system to generate provably correct implementations of several algorithms, including some important algorithms from computational biology, and show that the performance is comparable to that of the best manually optimized code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Srinivasan:2016:SMC, author = "Venkatesh Srinivasan and Tushar Sharma and Thomas Reps", title = "Speeding up machine-code synthesis", journal = j-SIGPLAN, volume = "51", number = "10", pages = "165--180", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984006", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Machine-code synthesis is the problem of searching for an instruction sequence that implements a semantic specification, given as a formula in quantifier-free bit-vector logic (QFBV).
Instruction sets like Intel's IA-32 have around 43,000 unique instruction schemas; this huge instruction pool, along with the exponential cost inherent in enumerative synthesis, results in an enormous search space for a machine-code synthesizer: even for relatively small specifications, the synthesizer might take several hours or days to find an implementation. In this paper, we present several improvements to the algorithms used in a state-of-the-art machine-code synthesizer McSynth. In addition to a novel pruning heuristic, our improvements incorporate a number of ideas known from the literature, which we adapt in novel ways for the purpose of speeding up machine-code synthesis. Our experiments for Intel's IA-32 instruction set show that our improvements enable synthesis of code for 12 out of 14 formulas on which McSynth times out, speeding up the synthesis time by at least 1981X, and for the remaining formulas, speeds up synthesis by 3X.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Panchekha:2016:ARW, author = "Pavel Panchekha and Emina Torlak", title = "Automated reasoning for web page layout", journal = j-SIGPLAN, volume = "51", number = "10", pages = "181--194", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984010", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Web pages define their appearance using Cascading Style Sheets, a modular language for layout of tree-structured documents. In principle, using CSS is easy: the developer specifies declarative constraints on the layout of an HTML document (such as the positioning of nodes in the HTML tree), and the browser solves the constraints to produce a box-based rendering of that document. In practice, however, the subtleties of CSS semantics make it difficult to develop stylesheets that produce the intended layout across different user preferences and browser settings. This paper presents the first mechanized formalization of a substantial fragment of the CSS semantics. This formalization is equipped with an efficient reduction to the theory of quantifier-free linear real arithmetic, enabling effective automated reasoning about CSS stylesheets and their behavior. We implement this reduction in Cassius, a solver-aided framework for building semantics-aware tools for CSS. To demonstrate the utility of Cassius, we prototype new tools for automated verification, debugging, and synthesis of CSS code. 
We show that these tools work on fragments of real-world websites, and that Cassius is a practical first step toward solver-aided programming for the web.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Wang:2016:FFS, author = "Xinyu Wang and Sumit Gulwani and Rishabh Singh", title = "{FIDEX}: filtering spreadsheet data using examples", journal = j-SIGPLAN, volume = "51", number = "10", pages = "195--213", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984030", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data filtering in spreadsheets is a common problem faced by millions of end-users. The task of data filtering requires a computational model that can separate intended positive and negative string instances. We present a system, FIDEX, that can efficiently learn desired data filtering expressions from a small set of positive and negative string examples. There are two key ideas of our approach. First, we design an expressive DSL to represent disjunctive filter expressions needed for several real-world data filtering tasks. Second, we develop an efficient synthesis algorithm for incrementally learning consistent filter expressions in the DSL from very few positive and negative examples. A DAG-based data structure is used to succinctly represent a large number of filter expressions, and two corresponding operators are defined for algorithmically handling positive and negative examples, namely, the intersection and subtraction operators. FIDEX is able to learn data filters for 452 out of 460 real-world data filtering tasks in real time (0.22s), using only 2.2 positive string instances and 2.7 negative string instances on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Moore:2016:EAC, author = "Scott Moore and Christos Dimoulas and Robert Bruce Findler and Matthew Flatt and Stephen Chong", title = "Extensible access control with authorization contracts", journal = j-SIGPLAN, volume = "51", number = "10", pages = "214--233", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984021", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Existing programming language access control frameworks do not meet the needs of all software components. We propose an expressive framework for implementing access control monitors for components. The basis of the framework is a novel concept: the authority environment. An authority environment associates rights with an execution context. The building blocks of access control monitors in our framework are authorization contracts: software contracts that manage authority environments. 
We demonstrate the expressiveness of our framework by implementing a diverse set of existing access control mechanisms and writing custom access control monitors for three realistic case studies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Osvald:2016:GGT, author = "Leo Osvald and Gr{\'e}gory Essertel and Xilun Wu and Lilliam I. Gonz{\'a}lez Alay{\'o}n and Tiark Rompf", title = "Gentrification gone too far? {Affordable} 2nd-class values for fun and (co-)effect", journal = j-SIGPLAN, volume = "51", number = "10", pages = "234--251", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984009", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "First-class functions dramatically increase expressiveness, at the expense of static guarantees. In ALGOL or PASCAL, functions could be passed as arguments but never escape their defining scope. Therefore, function arguments could serve as temporary access tokens or capabilities, enabling callees to perform some action, but only for the duration of the call. In modern languages, such programming patterns are no longer available. The central thrust of this paper is to re-introduce second-class functions and other values alongside first-class entities in modern languages. We formalize second-class values with stack-bounded lifetimes as an extension to simply-typed $ \lambda $ calculus, and for richer type systems such as F$_{ < \colon }$ and systems with path-dependent types. We generalize the binary first- vs second-class distinction to arbitrary privilege lattices, with the underlying type lattice as a special case. In this setting, abstract types naturally enable privilege parametricity. We prove type soundness and lifetime properties in Coq. We implement our system as an extension of Scala, and present several case studies. First, we modify the Scala Collections library and add privilege annotations to all higher-order functions. Privilege parametricity is key to retain the high degree of code-reuse between sequential and parallel as well as lazy and eager collections. Second, we use scoped capabilities to introduce a model of checked exceptions in the Scala library, with only few changes to the code. Third, we employ second-class capabilities for memory safety in a region-based off-heap memory library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{DiLorenzo:2016:IFD, author = "Jonathan DiLorenzo and Richard Zhang and Erin Menzies and Kathleen Fisher and Nate Foster", title = "Incremental forest: a {DSL} for efficiently managing filestores", journal = j-SIGPLAN, volume = "51", number = "10", pages = "252--271", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984034", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "File systems are often used to store persistent application data, but manipulating file systems using standard APIs can be difficult for programmers. 
Forest is a domain-specific language that bridges the gap between the on-disk and in-memory representations of file system data. Given a high-level specification of the structure, contents, and properties of a collection of directories, files, and symbolic links, the Forest compiler generates tools for loading, storing, and validating that data. Unfortunately, the initial implementation of Forest offered few mechanisms for controlling cost --- e.g., the run-time system could load gigabytes of data, even if only a few bytes were needed. This paper introduces Incremental Forest (iForest), an extension to Forest with an explicit delay construct that programmers can use to precisely control costs. We describe the design of iForest using a series of running examples, present a formal semantics in a core calculus, and define a simple cost model that accurately characterizes the resources needed to use a given specification. We propose skins, which allow programmers to modify the delay structure of a specification in a compositional way, and develop a static type system for ensuring compatibility between specifications and skins. We prove the soundness and completeness of the type system and a variety of algebraic properties of skins. We describe an OCaml implementation and evaluate its performance on applications developed in collaboration with watershed hydrologists.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Haller:2016:LLA, author = "Philipp Haller and Alex Loiko", title = "{LaCasa}: lightweight affinity and object capabilities in {Scala}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "272--291", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984042", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Aliasing is a known source of challenges in the context of imperative object-oriented languages, which have led to important advances in type systems for aliasing control. However, their large-scale adoption has turned out to be a surprisingly difficult challenge. While new language designs show promise, they do not address the need of aliasing control in existing languages. This paper presents a new approach to isolation and uniqueness in an existing, widely-used language, Scala. The approach is unique in the way it addresses some of the most important obstacles to the adoption of type system extensions for aliasing control. First, adaptation of existing code requires only a minimal set of annotations. Only a single bit of information is required per class. Surprisingly, the paper shows that this information can be provided by the object-capability discipline, widely-used in program security. We formalize our approach as a type system and prove key soundness theorems. The type system is implemented for the full Scala language, providing, for the first time, a sound integration with Scala's local type inference. 
Finally, we empirically evaluate the conformity of existing Scala open-source code on a corpus of over 75,000 LOC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{PerezDeRosso:2016:PCM, author = "Santiago {Perez De Rosso} and Daniel Jackson", title = "Purposes, concepts, misfits, and a redesign of git", journal = j-SIGPLAN, volume = "51", number = "10", pages = "292--310", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984018", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Git is a widely used version control system that is powerful but complicated. Its complexity may not be an inevitable consequence of its power but rather evidence of flaws in its design. To explore this hypothesis, we analyzed the design of Git using a theory that identifies concepts, purposes, and misfits. Some well-known difficulties with Git are described, and explained as misfits in which underlying concepts fail to meet their intended purpose. Based on this analysis, we designed a reworking of Git (called Gitless) that attempts to remedy these flaws. To correlate misfits with issues reported by users, we conducted a study of Stack Overflow questions. And to determine whether users experienced fewer complications using Gitless in place of Git, we conducted a small user study. Results suggest our approach can be profitable in identifying, analyzing, and fixing design problems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Kim:2016:AAP, author = "Dohyeong Kim and Yonghwi Kwon and Peng Liu and I. Luk Kim and David Mitchel Perry and Xiangyu Zhang and Gustavo Rodriguez-Rivera", title = "{Apex}: automatic programming assignment error explanation", journal = j-SIGPLAN, volume = "51", number = "10", pages = "311--327", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984031", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents Apex, a system that can automatically generate explanations for programming assignment bugs, regarding where the bugs are and how the root causes led to the runtime failures. It works by comparing the passing execution of a correct implementation (provided by the instructor) and the failing execution of the buggy implementation (submitted by the student). The technique overcomes a number of technical challenges caused by syntactic and semantic differences of the two implementations. It collects the symbolic traces of the executions and matches assignment statements in the two execution traces by reasoning about symbolic equivalence. It then matches predicates by aligning the control dependences of the matched assignment statements, avoiding direct matching of path conditions which are usually quite different. 
Our evaluation shows that Apex is very effective for 205 buggy real world student submissions of 4 programming assignments, and a set of 15 programming assignment type of buggy programs collected from stackoverflow.com, precisely pinpointing the root causes and capturing the causality for 94.5\% of them. The evaluation on a standard benchmark set with over 700 student bugs shows similar results. A user study in the classroom shows that Apex has substantially improved student productivity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Hanappi:2016:ARC, author = "Oliver Hanappi and Waldemar Hummer and Schahram Dustdar", title = "Asserting reliable convergence for configuration management scripts", journal = j-SIGPLAN, volume = "51", number = "10", pages = "328--343", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984000", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The rise of elastically scaling applications that frequently deploy new machines has led to the adoption of DevOps practices across the cloud engineering stack. So-called configuration management tools utilize scripts that are based on declarative resource descriptions and make the system converge to the desired state. It is crucial for convergent configurations to be able to gracefully handle transient faults, e.g., network outages when downloading and installing software packages. In this paper we introduce a conceptual framework for asserting reliable convergence in configuration management. Based on a formal definition of configuration scripts and their resources, we utilize state transition graphs to test whether a script makes the system converge to the desired state under different conditions. In our generalized model, configuration actions are partially ordered, often resulting in prohibitively many possible execution orders. To reduce this problem space, we define and analyze a property called preservation, and we show that if preservation holds for all pairs of resources, then convergence holds for the entire configuration. Our implementation builds on Puppet, but the approach is equally applicable to other frameworks like Chef, Ansible, etc. We perform a comprehensive evaluation based on real world Puppet scripts and show the effectiveness of the approach.
Our tool is able to detect all idempotence and convergence related issues in a set of existing Puppet scripts with known issues as well as some hitherto undiscovered bugs in a large random sample of scripts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Treichler:2016:DP, author = "Sean Treichler and Michael Bauer and Rahul Sharma and Elliott Slaughter and Alex Aiken", title = "Dependent partitioning", journal = j-SIGPLAN, volume = "51", number = "10", pages = "344--358", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984016", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A key problem in parallel programming is how data is partitioned: divided into subsets that can be operated on in parallel and, in distributed memory machines, spread across multiple address spaces. We present a dependent partitioning framework that allows an application to concisely describe relationships between partitions. Applications first establish independent partitions, which may contain arbitrary subsets of application data, permitting the expression of arbitrary application-specific data distributions. Dependent partitions are then derived from these using the dependent partitioning operations provided by the framework. By directly capturing inter-partition relationships, our framework can soundly and precisely reason about programs to perform important program analyses crucial to ensuring correctness and achieving good performance. As an example of the reasoning made possible, we present a static analysis that discharges most consistency checks on partitioned data during compilation. We describe an implementation of our framework within Regent, a language designed for the Legion programming model. The use of dependent partitioning constructs results in a 86-96\% decrease in the lines of code required to describe the partitioning, eliminates many of the expensive dynamic checks required for soundness by the current Regent partitioning implementation, and speeds up the computation of partitions by 2.6-12.7X even on a single thread. Additionally, we show that a distributed implementation incorporated into the Legion runtime system allows partitioning of data sets that are too large to fit on a single node and yields a further 29X speedup of partitioning operations on 64 nodes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Kulkarni:2016:APA, author = "Sulekha Kulkarni and Ravi Mangal and Xin Zhang and Mayur Naik", title = "Accelerating program analyses by cross-program training", journal = j-SIGPLAN, volume = "51", number = "10", pages = "359--377", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984023", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Practical programs share large modules of code. However, many program analyses are ineffective at reusing analysis results for shared code across programs. 
We present POLYMER, an analysis optimizer to address this problem. POLYMER runs the analysis offline on a corpus of training programs and learns analysis facts over shared code. It prunes the learnt facts to eliminate intermediate computations and then reuses these pruned facts to accelerate the analysis of other programs that share code with the training corpus. We have implemented POLYMER to accelerate analyses specified in Datalog, and apply it to optimize two analyses for Java programs: a call-graph analysis that is flow- and context-insensitive, and a points-to analysis that is flow- and context-sensitive. We evaluate the resulting analyses on ten programs from the DaCapo suite that share the JDK library. POLYMER achieves average speedups of 2.6$ \times $ for the call-graph analysis and 5.2$ \times $ for the points-to analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Srinivasan:2016:IAS, author = "Venkatesh Srinivasan and Thomas Reps", title = "An improved algorithm for slicing machine code", journal = j-SIGPLAN, volume = "51", number = "10", pages = "378--393", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984003", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Machine-code slicing is an important primitive for building binary analysis and rewriting tools, such as taint trackers, fault localizers, and partial evaluators. However, it is not easy to create a machine-code slicer that exhibits a high level of precision. Moreover, the problem of creating such a tool is compounded by the fact that a small amount of local imprecision can be amplified via cascade effects. Most instructions in instruction sets such as Intel's IA-32 and ARM are multi-assignments: they have several inputs and several outputs (registers, flags, and memory locations). This aspect of the instruction set introduces a granularity issue during slicing: there are often instructions at which we would like the slice to include only a subset of the instruction's semantics, whereas the slice is forced to include the entire instruction. Consequently, the slice computed by state-of-the-art tools is very imprecise, often including essentially the entire program. This paper presents an algorithm to slice machine code more accurately. To counter the granularity issue, our algorithm performs slicing at the microcode level, instead of the instruction level, and obtains a more precise microcode slice. To reconstitute a machine-code program from a microcode slice, our algorithm uses machine-code synthesis. 
Our experiments on IA-32 binaries of FreeBSD utilities show that, in comparison to slices computed by a state-of-the-art tool, our algorithm reduces the size of backward slices by 33\%, and forward slices by 70\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Petrashko:2016:CGL, author = "Dmitry Petrashko and Vlad Ureche and Ondrej Lhot{\'a}k and Martin Odersky", title = "Call graphs for languages with parametric polymorphism", journal = j-SIGPLAN, volume = "51", number = "10", pages = "394--409", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983991", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The performance of contemporary object oriented languages depends on optimizations such as devirtualization, inlining, and specialization, and these in turn depend on precise call graph analysis. Existing call graph analyses do not take advantage of the information provided by the rich type systems of contemporary languages, in particular generic type arguments. Many existing approaches analyze Java bytecode, in which generic types have been erased. This paper shows that this discarded information is actually very useful as the context in a context-sensitive analysis, where it significantly improves precision and keeps the running time small. Specifically, we propose and evaluate call graph construction algorithms in which the contexts of a method are (i) the type arguments passed to its type parameters, and (ii) the static types of the arguments passed to its term parameters. The use of static types from the caller as context is effective because it allows more precise dispatch of call sites inside the callee. Our evaluation indicates that the average number of contexts required per method is small. We implement the analysis in the Dotty compiler for Scala, and evaluate it on programs that use the type-parametric Scala collections library and on the Dotty compiler itself. The context-sensitive analysis runs 1.4x faster than a context-insensitive one and discovers 20\% more monomorphic call sites at the same time. When applied to method specialization, the imprecision in a context-insensitive call graph would require the average method to be cloned 22 times, whereas the context-sensitive call graph indicates a much more practical 1.00 to 1.50 clones per method. We applied the proposed analysis to automatically specialize generic methods. The resulting automatic transformation achieves the same performance as state-of-the-art techniques requiring manual annotations, while reducing the size of the generated bytecode by up to 5 $ \times $.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Chandra:2016:TIS, author = "Satish Chandra and Colin S. 
Gordon and Jean-Baptiste Jeannin and Cole Schlesinger and Manu Sridharan and Frank Tip and Youngil Choi", title = "Type inference for static compilation of {JavaScript}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "410--429", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984017", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a type system and inference algorithm for a rich subset of JavaScript equipped with objects, structural subtyping, prototype inheritance, and first-class methods. The type system supports abstract and recursive objects, and is expressive enough to accommodate several standard benchmarks with only minor workarounds. The invariants enforced by the types enable an ahead-of-time compiler to carry out optimizations typically beyond the reach of static compilers for dynamic languages. Unlike previous inference techniques for prototype inheritance, our algorithm uses a combination of lower and upper bound propagation to infer types and discover type errors in all code, including uninvoked functions. The inference is expressed in a simple constraint language, designed to leverage off-the-shelf fixed point solvers. We prove soundness for both the type system and inference algorithm. An experimental evaluation showed that the inference is powerful, handling the aforementioned benchmarks with no manual type annotation, and that the inferred types enable effective static compilation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Samak:2016:DSF, author = "Malavika Samak and Omer Tripp and Murali Krishna Ramanathan", title = "Directed synthesis of failing concurrent executions", journal = j-SIGPLAN, volume = "51", number = "10", pages = "430--446", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984040", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Detecting concurrency-induced bugs in multithreaded libraries can be challenging due to the intricacies associated with their manifestation. This includes invocation of multiple methods, synthesis of inputs to the methods to reach the failing location, and crafting of thread interleavings that cause the erroneous behavior. Neither fuzzing-based testing techniques nor over-approximate static analyses are well positioned to detect such subtle defects while retaining high accuracy alongside satisfactory coverage. In this paper, we propose a directed, iterative and scalable testing engine that combines the strengths of static and dynamic analysis to help synthesize concurrent executions to expose complex concurrency-induced bugs. Our engine accepts as input the library, its client (either sequential or concurrent) and a specification of correctness. Then, it iteratively refines the client to generate an execution that can break the input specification. 
Each step of the iterative process includes statically identifying sub-goals towards the goal of failing the specification, generating a plan toward meeting these goals, and merging of the paths traversed dynamically with the plan computed statically via constraint solving to generate a new client. The engine reports full reproduction scenarios, guaranteed to be true, for the bugs it finds. We have created a prototype of our approach named MINION. We validated MINION by applying it to well-tested concurrent classes from popular Java libraries, including the latest versions of OpenJDK and Google-Guava. We were able to detect 31 real crashes across 10 classes in a total of 23 minutes, including previously unknown bugs. Comparison with three other tools reveals that combined, they report only 9 of the 31 crashes (and no other crashes beyond MINION). This is because several of these bugs manifest under deeply nested path conditions (observed maximum of 11), deep nesting of method invocations (observed maximum of 6) and multiple refinement iterations to generate the crash-inducing client.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Huang:2016:MCR, author = "Shiyou Huang and Jeff Huang", title = "Maximal causality reduction for {TSO} and {PSO}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "447--461", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984025", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Verifying concurrent programs is challenging due to the exponentially large thread interleaving space. The problem is exacerbated by relaxed memory models such as Total Store Order (TSO) and Partial Store Order (PSO) which further explode the interleaving space by reordering instructions. A recent advance, Maximal Causality Reduction (MCR), has shown great promise to improve verification effectiveness by maximally reducing redundant explorations. However, the original MCR only works for the Sequential Consistency (SC) memory model, but not for TSO and PSO. In this paper, we develop novel extensions to MCR by solving two key problems under TSO and PSO: (1) generating interleavings that can reach new states by encoding the operational semantics of TSO and PSO with first-order logical constraints and solving them with SMT solvers, and (2) enforcing TSO and PSO interleavings by developing novel replay algorithms that allow executions out of the program order. We show that our approach successfully enables MCR to effectively explore TSO and PSO interleavings. We have compared our approach with a recent Dynamic Partial Order Reduction (DPOR) algorithm for TSO and PSO and a SAT-based stateless model checking approach. Our results show that our approach is much more effective than the other approaches for both state-space exploration and bug finding --- on average it explores 5-10X fewer executions and finds many bugs that the other tools cannot find.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Huang:2016:PMR, author = "Jeff Huang and Arun K. 
Rajagopalan", title = "Precise and maximal race detection from incomplete traces", journal = j-SIGPLAN, volume = "51", number = "10", pages = "462--476", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984024", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present RDIT, a novel dynamic technique to detect data races in multithreaded programs with incomplete trace information, i.e., in the presence of missing events. RDIT is both precise and maximal: it does not report any false alarms and it detects a maximal set of true traces from the observed incomplete trace. RDIT is underpinned by a sound BarrierPair model that abstracts away the missing events by capturing the invocation data of their enclosing methods. By making the least conservative abstraction that a missing method introduces synchronization only when it has a memory address in scope that overlaps with other events or other missing methods, and by formulating maximal thread causality as logical constraints, RDIT guarantees to precisely detect races with maximal capability. RDIT has been applied in seven real-world large concurrent systems and has detected dozens of true races with zero false alarms. Comparatively, existing algorithms such as Happens-Before, Causal-Precedes, and Maximal-Causality which are known to be precise all report many false alarms when missing synchronizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Blum:2016:SMC, author = "Ben Blum and Garth Gibson", title = "Stateless model checking with data-race preemption points", journal = j-SIGPLAN, volume = "51", number = "10", pages = "477--493", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984036", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Stateless model checking is a powerful technique for testing concurrent programs, but suffers from exponential state space explosion when the test input parameters are too large. Several reduction techniques can mitigate this explosion, but even after pruning equivalent interleavings, the state space size is often intractable. Most prior tools are limited to preempting only on synchronization APIs, which reduces the space further, but can miss unsynchronized thread communication bugs. Data race detection, another concurrency testing approach, focuses on suspicious memory access pairs during a single test execution. It avoids concerns of state space size, but may report races that do not lead to observable failures, which jeopardizes a user's willingness to use the analysis. We present Quicksand, a new stateless model checking framework which manages the exploration of many state spaces using different preemption points. It uses state space estimation to prioritize jobs most likely to complete in a fixed CPU budget, and it incorporates data-race analysis to add new preemption points on the fly. Preempting threads during a data race's instructions can automatically classify the race as buggy or benign, and uncovers new bugs not reachable by prior model checkers. 
It also enables full verification of all possible schedules when every data race is verified as benign within the CPU budget. In our evaluation, Quicksand found 1.25x as many bugs and verified 4.3x as many tests compared to prior model checking approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Gollamudi:2016:AEE, author = "Anitha Gollamudi and Stephen Chong", title = "Automatic enforcement of expressive security policies using enclaves", journal = j-SIGPLAN, volume = "51", number = "10", pages = "494--513", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984002", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hardware-based enclave protection mechanisms, such as Intel's SGX, ARM's TrustZone, and Apple's Secure Enclave, can protect code and data from powerful low-level attackers. In this work, we use enclaves to enforce strong application-specific information security policies. We present IMP$_E$, a novel calculus that captures the essence of SGX-like enclave mechanisms, and show that a security-type system for IMP$_E$ can enforce expressive confidentiality policies (including erasure policies and delimited release policies) against powerful low-level attackers, including attackers that can arbitrarily corrupt non-enclave code, and, under some circumstances, corrupt enclave code. We present a translation from an expressive security-typed calculus (that is not aware of enclaves) to IMP$_E$. The translation automatically places code and data into enclaves to enforce the security policies of the source program.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Colin:2016:CTC, author = "Alexei Colin and Brandon Lucia", title = "{Chain}: tasks and channels for reliable intermittent programs", journal = j-SIGPLAN, volume = "51", number = "10", pages = "514--530", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983995", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Energy harvesting computers enable general-purpose computing using energy collected from their environment. Energy-autonomy of such devices has great potential, but their intermittent power supply poses a challenge. Intermittent program execution compromises progress and leaves state inconsistent. This work describes Chain: a new model for programming intermittent devices. A Chain program is a set of programmer-defined tasks that compute and exchange data through channels. Chain guarantees forward progress at task granularity. A task is restartable and never sees inconsistent state, because its input and output channels are separated. Our system supports language features for expressing advanced data exchange patterns and for encapsulating reusable functionality. Chain fundamentally differs from state-of-the-art checkpointing approaches and does not incur the associated overhead. We implement Chain as C language extensions and a runtime library. 
We used Chain to implement four applications: machine learning, encryption, compression, and sensing. In experiments, Chain ensured consistency where prior approaches failed and improved throughput by 2-7x over the leading state-of-the-art system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Bonetta:2016:GSM, author = "Daniele Bonetta and Luca Salucci and Stefan Marr and Walter Binder", title = "{GEMs}: shared-memory parallel programming for {Node.js}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "531--547", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984039", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "JavaScript is the most popular programming language for client-side Web applications, and Node.js has popularized the language for server-side computing, too. In this domain, the minimal support for parallel programming remains however a major limitation. In this paper we introduce a novel parallel programming abstraction called Generic Messages (GEMs). GEMs allow one to combine message passing and shared-memory parallelism, extending the classes of parallel applications that can be built with Node.js. GEMs have customizable semantics and enable several forms of thread safety, isolation, and concurrency control. GEMs are designed as convenient JavaScript abstractions that expose high-level and safe parallelism models to the developer. Experiments show that GEMs outperform equivalent Node.js applications thanks to their usage of shared memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Peters:2016:OCF, author = "Arthur Michener Peters and David Kitchin and John A. Thywissen and William R. Cook", title = "{OrcO}: a concurrency-first approach to objects", journal = j-SIGPLAN, volume = "51", number = "10", pages = "548--567", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984022", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The majority of modern programming languages provide concurrency and object-orientation in some form. However, object-oriented concurrency remains cumbersome in many situations. We introduce the language OrcO, Orc with concurrent Objects, which enables a flexible style of concurrent object-oriented programming. OrcO extends the Orc programming language by adding abstractions for programming-in-the-large; namely objects, classes, and inheritance. OrcO objects are designed to be orthogonal to concurrency, allowing the concurrent structure and object structure of a program to evolve independently. 
This paper describes OrcO's goals and design and provides examples of how OrcO can be used to deftly handle events, object management, and object composition.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Ancona:2016:SSI, author = "Davide Ancona and Andrea Corradi", title = "Semantic subtyping for imperative object-oriented languages", journal = j-SIGPLAN, volume = "51", number = "10", pages = "568--587", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983992", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Semantic subtyping is an approach for defining sound and complete procedures to decide subtyping for expressive types, including union and intersection types; although it has been exploited especially in functional languages for XML based programming, recently it has been partially investigated in the context of object-oriented languages, and a sound and complete subtyping algorithm has been proposed for record types, but restricted to immutable fields, with union and recursive types interpreted coinductively to support cyclic objects. In this work we address the problem of studying semantic subtyping for imperative object-oriented languages, where fields can be mutable; in particular, we add read/write field annotations to record types, and, besides union, we consider intersection types as well, while maintaining coinductive interpretation of recursive types. In this way, we get a richer notion of type with a flexible subtyping relation, able to express a variety of type invariants useful for enforcing static guarantees for mutable objects. The addition of these features radically changes the definition of subtyping, and, hence, the corresponding decision procedure, and surprisingly invalidates some subtyping laws that hold in the functional setting. We propose an intuitive model where mutable record values contain type information to specify the values that can be correctly stored in fields. Such a model, and the corresponding subtyping rules, require particular care to avoid circularity between coinductive judgments and their negations which, by duality, have to be interpreted inductively. A sound and complete subtyping algorithm is provided, together with a prototype implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Brachthauser:2016:PFC, author = "Jonathan Immanuel Brachth{\"a}user and Tillmann Rendel and Klaus Ostermann", title = "Parsing with first-class derivatives", journal = j-SIGPLAN, volume = "51", number = "10", pages = "588--606", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984026", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Brzozowski derivatives, well known in the context of regular expressions, have recently been rediscovered to give a simplified explanation to parsers of context-free languages. We add derivatives as a novel first-class feature to a standard parser combinator language. 
First-class derivatives enable an inversion of the control flow, allowing to implement modular parsers for languages that previously required separate pre-processing steps or cross-cutting modifications of the parsers. We show that our framework offers new opportunities for reuse and supports a modular definition of interesting use cases of layout-sensitive parsing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Kell:2016:MLE, author = "Stephen Kell and Dominic P. Mulligan and Peter Sewell", title = "The missing link: explaining {ELF} static linking, semantically", journal = j-SIGPLAN, volume = "51", number = "10", pages = "607--623", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983996", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Beneath the surface, software usually depends on complex linker behaviour to work as intended. Even linking {\tt hello\_world.c} is surprisingly involved, and systems software such as {\tt libc} and operating system kernels rely on a host of linker features. But linking is poorly understood by working programmers and has largely been neglected by language researchers. In this paper we survey the many use-cases that linkers support and the poorly specified linker speak by which they are controlled: metadata in object files, command-line options, and linker-script language. We provide the first validated formalisation of a realistic executable and linkable format (ELF), and capture aspects of the Application Binary Interfaces for four mainstream platforms (AArch64, AMD64, Power64, and IA32). Using these, we develop an executable specification of static linking, covering (among other things) enough to link small C programs (we use the example of bzip2) into a correctly running executable. We provide our specification in Lem and Isabelle/HOL forms. This is the first formal specification of mainstream linking. We have used the Isabelle/HOL version to prove a sample correctness property for one case of AMD64 ABI relocation, demonstrating that the specification supports formal proof, and as a first step towards the much more ambitious goal of verified linking. Our work should enable several novel strands of research, including linker-aware verified compilation and program analysis, and better languages for controlling linking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Rompf:2016:TSD, author = "Tiark Rompf and Nada Amin", title = "Type soundness for dependent object types {(DOT)}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "624--641", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984008", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scala's type system unifies aspects of ML modules, object-oriented, and functional programming. The Dependent Object Types (DOT) family of calculi has been proposed as a new theoretic foundation for Scala and similar expressive languages. 
Unfortunately, type soundness has only been established for restricted subsets of DOT. In fact, it has been shown that important Scala features such as type refinement or a subtyping relation with lattice structure break at least one key metatheoretic property such as environment narrowing or invertible subtyping transitivity, which are usually required for a type soundness proof. The main contribution of this paper is to demonstrate how, perhaps surprisingly, even though these properties are lost in their full generality, a rich DOT calculus that includes recursive type refinement and a subtyping lattice with intersection types can still be proved sound. The key insight is that subtyping transitivity only needs to be invertible in code paths executed at runtime, with contexts consisting entirely of valid runtime objects, whereas inconsistent subtyping contexts can be permitted for code that is never executed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Daloze:2016:ETS, author = "Benoit Daloze and Stefan Marr and Daniele Bonetta and Hanspeter M{\"o}ssenb{\"o}ck", title = "Efficient and thread-safe objects for dynamically-typed languages", journal = j-SIGPLAN, volume = "51", number = "10", pages = "642--659", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984001", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We are in the multi-core era. Dynamically-typed languages are in widespread use, but their support for multithreading still lags behind. One of the reasons is that the sophisticated techniques they use to efficiently represent their dynamic object models are often unsafe in multithreaded environments. This paper defines safety requirements for dynamic object models in multithreaded environments. Based on these requirements, a language-agnostic and thread-safe object model is designed that maintains the efficiency of sequential approaches. This is achieved by ensuring that field reads do not require synchronization and field updates only need to synchronize on objects shared between threads. Basing our work on JRuby+Truffle, we show that our safe object model has zero overhead on peak performance for thread-local objects and only 3\% average overhead on parallel benchmarks where field updates require synchronization. Thus, it can be a foundation for safe and efficient multithreaded VMs for a wide range of dynamic languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Chapman:2016:HSH, author = "Keith Chapman and Antony L. Hosking and J. Eliot B. 
Moss", title = "Hybrid {STM\slash HTM} for nested transactions on {OpenJDK}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "660--676", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984029", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactional memory (TM) has long been advocated as a promising pathway to more automated concurrency control for scaling concurrent programs running on parallel hardware. Software TM (STM) has the benefit of being able to run general transactional programs, but at the significant cost of overheads imposed to log memory accesses, mediate access conflicts, and maintain other transaction metadata. Recently, hardware manufacturers have begun to offer commodity hardware TM (HTM) support in their processors wherein the transaction metadata is maintained ``for free'' in hardware. However, HTM approaches are only best-effort: they cannot successfully run all transactional programs, whether because of hardware capacity issues (causing large transactions to fail), or compatibility restrictions on the processor instructions permitted within hardware transactions (causing transactions that execute those instructions to fail). In such cases, programs must include failure-handling code to attempt the computation by some other software means, since retrying the transaction would be futile. Thus, a canonical use of HTM is lock elision: replacing lock regions with transactions, retrying some number of times in the case of conflicts, but falling back to locking when HTM fails for other reasons. Here, we describe how software and hardware schemes can combine seamlessly into a hybrid system in support of transactional programs, allowing use of low-cost HTM when it works, but reverting to STM when it doesn't. We describe heuristics used to make this choice dynamically and automatically, but allowing the transition back to HTM opportunistically. Our implementation is for an extension of Java having syntax for both open and closed nested transactions, and boosting, running on the OpenJDK, with dynamic injection of STM mechanisms (into code variants used under STM) and HTM instructions (into code variants used under HTM). Both schemes are compatible to allow different threads to run concurrently with either mechanism, while preserving transaction safety. Using a standard synthetic benchmark we demonstrate that HTM offers significant acceleration of both closed and open nested transactions, while yielding parallel scaling up to the limits of the hardware, whereupon scaling in software continues but with the penalty to throughput imposed by software mechanisms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Bhandari:2016:MFR, author = "Kumud Bhandari and Dhruva R. Chakrabarti and Hans-J. 
Boehm", title = "{Makalu}: fast recoverable allocation of non-volatile memory", journal = j-SIGPLAN, volume = "51", number = "10", pages = "677--694", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984019", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Byte addressable non-volatile memory (NVRAM) is likely to supplement, and perhaps eventually replace, DRAM. Applications can then persist data structures directly in memory instead of serializing them and storing them onto a durable block device. However, failures during execution can leave data structures in NVRAM unreachable or corrupt. In this paper, we present Makalu, a system that addresses non-volatile memory management. Makalu offers an integrated allocator and recovery-time garbage collector that maintains internal consistency, avoids NVRAM memory leaks, and is efficient, all in the face of failures. We show that a careful allocator design can support a less restrictive and a much more familiar programming model than existing persistent memory allocators. Our allocator significantly reduces the per allocation persistence overhead by lazily persisting non-essential metadata and by employing a post-failure recovery-time garbage collector. Experimental results show that the resulting online speed and scalability of our allocator are comparable to well-known transient allocators, and significantly better than state-of-the-art persistent allocators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Nunez:2016:PGC, author = "Diogenes Nunez and Samuel Z. Guyer and Emery D. Berger", title = "Prioritized garbage collection: explicit {GC} support for software caches", journal = j-SIGPLAN, volume = "51", number = "10", pages = "695--710", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984028", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmers routinely trade space for time to increase performance, often in the form of caching or memoization. In managed languages like Java or JavaScript, however, this space-time tradeoff is complex. Using more space translates into higher garbage collection costs, especially at the limit of available memory. Existing runtime systems provide limited support for space-sensitive algorithms, forcing programmers into difficult and often brittle choices about provisioning. This paper presents prioritized garbage collection, a cooperative programming language and runtime solution to this problem. Prioritized GC provides an interface similar to soft references, called priority references, which identify objects that the collector can reclaim eagerly if necessary. The key difference is an API for defining the policy that governs when priority references are cleared and in what order. Application code specifies a priority value for each reference and a target memory bound. The collector reclaims references, lowest priority first, until the total memory footprint of the cache fits within the bound. 
We use this API to implement a space-aware least-recently-used (LRU) cache, called a Sache, that is a drop-in replacement for existing caches, such as Google's Guava library. The garbage collector automatically grows and shrinks the Sache in response to available memory and workload with minimal provisioning information from the programmer. Using a Sache, it is almost impossible for an application to experience a memory leak, memory pressure, or an out-of-memory crash caused by software caching.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Steimann:2016:CRA, author = "Friedrich Steimann and J{\"o}rg Hagemann and Bastian Ulke", title = "Computing repair alternatives for malformed programs using constraint attribute grammars", journal = j-SIGPLAN, volume = "51", number = "10", pages = "711--730", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984007", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Attribute grammars decorate the nodes of a program's parse tree with attributes whose values are defined by equations encoding the (static) semantics of a programming language. We show how replacing the equations of an attribute grammar with equivalent constraints that can be solved by a constraint solver allows us to compute repairs of a malformed program solely from a specification that was originally designed for checking its well-formedness. We present two repair modes --- shallow and deep fixing --- whose computed repair alternatives are guaranteed to repair every error on which they are invoked. While shallow fixing may introduce new errors, deep fixing never does; to make it tractable, we implement it using neighborhood search. We demonstrate the feasibility of our approach by implementing it on top of ExtendJ, an attribute grammar based Java compiler, and by applying it to an example from the Java EE context, detecting and fixing well-formedness errors (both real and injected) in a body of 14 open-source subject programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Raychev:2016:PMC, author = "Veselin Raychev and Pavol Bielik and Martin Vechev", title = "Probabilistic model for code with decision trees", journal = j-SIGPLAN, volume = "51", number = "10", pages = "731--747", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984041", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper we introduce a new approach for learning precise and general probabilistic models of code based on decision tree learning. Our approach directly benefits an emerging class of statistical programming tools which leverage probabilistic models of code learned over large codebases (e.g., GitHub) to make predictions about new programs (e.g., code completion, repair, etc). The key idea is to phrase the problem of learning a probabilistic model of code as learning a decision tree in a domain specific language over abstract syntax trees (called TGen). 
This allows us to condition the prediction of a program element on a dynamically computed context. Further, our problem formulation enables us to easily instantiate known decision tree learning algorithms such as ID3, but also to obtain new variants we refer to as ID3+ and E13, not previously explored and ones that outperform ID3 in prediction accuracy. Our approach is general and can be used to learn a probabilistic model of any programming language. We implemented our approach in a system called Deep3 and evaluated it for the challenging task of learning probabilistic models of JavaScript and Python. Our experimental results indicate that Deep3 predicts elements of JavaScript and Python code with precision above 82\% and 69\%, respectively. Further, Deep3 often significantly outperforms state-of-the-art approaches in overall prediction accuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Barman:2016:RWA, author = "Shaon Barman and Sarah Chasins and Rastislav Bodik and Sumit Gulwani", title = "{Ringer}: web automation by demonstration", journal = j-SIGPLAN, volume = "51", number = "10", pages = "748--764", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984020", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With increasing amounts of data available on the web and a diverse range of users interested in programmatically accessing that data, web automation must become easier. Automation helps users complete many tedious interactions, such as scraping data, completing forms, or transferring data between websites. However, writing web automation scripts typically requires an expert programmer because the writer must be able to reverse engineer the target webpage. We have built a record and replay tool, Ringer, that makes web automation accessible to non-coders. Ringer takes a user demonstration as input and creates a script that interacts with the page as a user would. This approach makes Ringer scripts more robust to webpage changes because user-facing interfaces remain relatively stable compared to the underlying webpage implementations. We evaluated our approach on benchmarks recorded on real webpages and found that it replayed 4x more benchmarks than a state-of-the-art replay tool.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Weitz:2016:SVB, author = "Konstantin Weitz and Doug Woos and Emina Torlak and Michael D. Ernst and Arvind Krishnamurthy and Zachary Tatlock", title = "Scalable verification of {Border Gateway Protocol} configurations with an {SMT} solver", journal = j-SIGPLAN, volume = "51", number = "10", pages = "765--780", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984012", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Internet Service Providers (ISPs) use the Border Gateway Protocol (BGP) to announce and exchange routes for delivering packets through the internet. 
ISPs must carefully configure their BGP routers to ensure traffic is routed reliably and securely. Correctly configuring BGP routers has proven challenging in practice, and misconfiguration has led to worldwide outages and traffic hijacks. This paper presents Bagpipe, a system that enables ISPs to declaratively express BGP policies and that automatically verifies that router configurations implement such policies. The novel initial network reduction soundly reduces policy verification to a search for counterexamples in a finite space. An SMT-based symbolic execution engine performs this search efficiently. Bagpipe reduces the size of its search space using predicate abstraction and parallelizes its search using symbolic variable hoisting. Bagpipe's policy specification language is expressive: we expressed policies inferred from real AS configurations, policies from the literature, and policies for 10 Juniper TechLibrary configuration scenarios. Bagpipe is efficient: we ran it on three ASes with a total of over 240,000 lines of Cisco and Juniper BGP configuration. Bagpipe is effective: it revealed 19 policy violations without issuing any false positives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Loncaric:2016:PFT, author = "Calvin Loncaric and Satish Chandra and Cole Schlesinger and Manu Sridharan", title = "A practical framework for type inference error explanation", journal = j-SIGPLAN, volume = "51", number = "10", pages = "781--799", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983994", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many languages have support for automatic type inference. But when inference fails, the reported error messages can be unhelpful, highlighting a code location far from the source of the problem. Several lines of work have emerged proposing error reports derived from correcting sets: a set of program points that, when fixed, produce a well-typed program. Unfortunately, these approaches are tightly tied to specific languages; targeting a new language requires encoding a type inference algorithm for the language in a custom constraint system specific to the error reporting tool. We show how to produce correcting set-based error reports by leveraging existing type inference implementations, easing the burden of adoption and, as type inference algorithms tend to be efficient in practice, producing error reports of comparable quality to similar error reporting tools orders of magnitude faster. Many type inference algorithms are already formulated as dual phases of type constraint generation and solving; rather than (re)implementing type inference in an error explanation tool, we isolate the solving phase and treat it as an oracle for solving typing constraints. Given any set of typing constraints, error explanation proceeds by iteratively removing conflicting constraints from the initial constraint set until discovering a subset on which the solver succeeds; the constraints removed form a correcting set. 
Our approach is agnostic to the semantics of any particular language or type system, instead leveraging the existing type inference engine to give meaning to constraints.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Kell:2016:DDT, author = "Stephen Kell", title = "Dynamically diagnosing type errors in unsafe code", journal = j-SIGPLAN, volume = "51", number = "10", pages = "800--819", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2983998", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Existing approaches for detecting type errors in unsafe languages are limited. Static analysis methods are imprecise, and often require source-level changes, while most dynamic methods check only memory properties (bounds, liveness, etc.), owing to a lack of run-time type information. This paper describes libcrunch, a system for binary-compatible run-time type checking of unmodified unsafe code, currently focusing on C. Practical experience shows that our prototype implementation is easily applicable to many real codebases without source-level modification, correctly flags programmer errors with a very low rate of false positives, offers a very low run-time overhead, and covers classes of error caught by no previously existing tool.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Long:2016:FCE, author = "Yuheng Long and Yu David Liu and Hridesh Rajan", title = "First-class effect reflection for effect-guided programming", journal = j-SIGPLAN, volume = "51", number = "10", pages = "820--837", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984037", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces a novel type-and-effect calculus, first-class effects, where the computational effect of an expression can be programmatically reflected, passed around as values, and analyzed at run time. A broad range of designs ``hard-coded'' in existing effect-guided analyses --- from thread scheduling, version-consistent software updating, to data zeroing --- can be naturally supported through the programming abstractions. The core technical development is a type system with a number of features, including a hybrid type system that integrates static and dynamic effect analyses, a refinement type system to verify application-specific effect management properties, a double-bounded type system that computes both over-approximation of effects and their under-approximation. We introduce and establish a notion of soundness called trace consistency, defined in terms of how the effect and trace correspond. 
The property sheds foundational insight on ``good'' first-class effect programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Amin:2016:JST, author = "Nada Amin and Ross Tate", title = "{Java} and {Scala}'s type systems are unsound: the existential crisis of null pointers", journal = j-SIGPLAN, volume = "51", number = "10", pages = "838--848", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984004", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present short programs that demonstrate the unsoundness of Java and Scala's current type systems. In particular, these programs provide parametrically polymorphic functions that can turn any type into any type without (down)casting. Fortunately, parametric polymorphism was not integrated into the Java Virtual Machine (JVM), so these examples do not demonstrate any unsoundness of the JVM. Nonetheless, we discuss broader implications of these findings on the field of programming languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Sun:2016:FCB, author = "Chengnian Sun and Vu Le and Zhendong Su", title = "Finding compiler bugs via live code mutation", journal = j-SIGPLAN, volume = "51", number = "10", pages = "849--863", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984038", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Validating optimizing compilers is challenging because it is hard to generate valid test programs (i.e., those that do not expose any undefined behavior). Equivalence Modulo Inputs (EMI) is an effective, promising methodology to tackle this problem. Given a test program with some inputs, EMI mutates the program to derive variants that are semantically equivalent w.r.t. these inputs. The state-of-the-art instantiations of EMI are Orion and Athena, both of which rely on deleting code from or inserting code into code regions that are not executed under the inputs. Although both have demonstrated their ability in finding many bugs in GCC and LLVM, they are still limited due to their mutation strategies that operate only on dead code regions. This paper presents a novel EMI technique that allows mutation in the entire program (i.e., both live and dead regions). By removing the restriction of mutating only the dead regions, our technique significantly increases the EMI variant space. It also helps to more thoroughly stress test compilers as compilers must optimize mutated live code, whereas mutated dead code might be eliminated. Finally, our technique also makes compiler bugs more noticeable as miscompilations on mutated dead code may not be observable. We have realized the proposed technique in Hermes. The evaluation demonstrates Hermes's effectiveness. 
In 13 months, Hermes found 168 confirmed, valid bugs in GCC and LLVM, of which 132 have already been fixed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Shan:2016:FRR, author = "Zhiyong Shan and Tanzirul Azim and Iulian Neamtiu", title = "Finding resume and restart errors in {Android} applications", journal = j-SIGPLAN, volume = "51", number = "10", pages = "864--880", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984011", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Smartphone apps create and handle a large variety of ``instance'' data that has to persist across runs, such as the current navigation route, workout results, antivirus settings, or game state. Due to the nature of the smartphone platform, an app can be paused, sent into background, or killed at any time. If the instance data is not saved and restored between runs, in addition to data loss, partially-saved or corrupted data can crash the app upon resume or restart. While smartphone platforms offer API support for data-saving and data-retrieving operations, the use of this API is ad-hoc: left to the programmer, rather than enforced by the compiler. We have observed that several categories of bugs---including data loss, failure to resume/restart or resuming/restarting in the wrong state---are due to incorrect handling of instance data and are easily triggered by just pressing the `Home' or `Back' buttons. To help address this problem, we have constructed a tool chain for Android (the KREfinder static analysis and the KREreproducer input generator) that helps find and reproduce such incorrect handling. We have evaluated our approach by running the static analysis on 324 apps, of which 49 were further analyzed manually. Results indicate that our approach is (i) effective, as it has discovered 49 bugs, including in popular Android apps, and (ii) efficient, completing on average in 61 seconds per app. More generally, our approach helps determine whether an app saves too much or too little state.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Zuo:2016:LOF, author = "Zhiqiang Zuo and Lu Fang and Siau-Cheng Khoo and Guoqing Xu and Shan Lu", title = "Low-overhead and fully automated statistical debugging with abstraction refinement", journal = j-SIGPLAN, volume = "51", number = "10", pages = "881--896", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984005", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cooperative statistical debugging is an effective approach for diagnosing production-run failures. To quickly identify failure predictors from the huge program predicate space, existing techniques rely on random or heuristics-guided predicate sampling at the user side. However, none of them can satisfy the requirements of low cost, low diagnosis latency, and high diagnosis quality simultaneously, which are all indispensable for statistical debugging to be practical. 
This paper presents a new technique that tackles the above challenges. We formulate the technique as an instance of abstraction refinement, where efficient abstract-level profiling is first applied to the whole program and its execution brings information that can pinpoint suspicious coarse-grained entities that need to be refined. The refinement profiles a corresponding set of fine-grained entities, and generates feedback that determines what to prune and what to refine next. The process is fully automated, and more importantly, guided by a mathematically rigorous analysis that guarantees that our approach produces the same debugging results as an exhaustive analysis in deterministic settings. We have implemented this technique for both C and Java on both single machine and distributed system. A thorough evaluation demonstrates that our approach yields (1) an order of magnitude reduction in the user-side runtime overhead even compared to a sampling-based approach and (2) two orders of magnitude reduction in the size of data transferred over the network, completely automatically without sacrificing any debugging capability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Bavishi:2016:PRA, author = "Rohan Bavishi and Awanish Pandey and Subhajit Roy", title = "To be precise: regression aware debugging", journal = j-SIGPLAN, volume = "51", number = "10", pages = "897--915", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984014", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Bounded model checking based debugging solutions search for mutations of program expressions that produce the expected output for a currently failing test. However, the current localization tools are not regression aware: they do not use information from the passing tests in their localization formula. On the other hand, the current repair tools attempt to guarantee regression freedom: when provided with a set of passing tests, they guarantee that none of these tests can break due to the suggested repair patch, thereby constructing a large repair formula. In this paper, we propose regression awareness as a means to improve the quality of localization and to scale repair. To enable regression awareness, we summarize the proof of correctness of each passing test by computing Craig Interpolants over a symbolic encoding of the passing execution, and use these summaries as additional soft constraints while synthesizing altered executions corresponding to failing tests. Intuitively, these additional constraints act as roadblocks, thereby discouraging executions that may damage the proof of a passing test. We use a partial MAXSAT solver to relax the proofs in a systematic way, and use a ranking function that penalizes mutations that damage the existing proofs. We have implemented our algorithms into a tool, TINTIN, that enables regression aware localization and repair. 
For localizations, our strategy is effective in extracting a superior ranking of suspicious locations: on a set of 52 different versions across 12 different programs spanning three benchmark suites, TINTIN achieves a saving of developer effort by almost 45\% (in terms of the locations that must be examined by a developer to reach the ground-truth repair) in the worst case and 27\% in the average case over existing techniques. For automated repairs, on our set of benchmarks, TINTIN achieves a 2.3X speedup over existing techniques without sacrificing much on the ranking of the repair patches: the ground-truth repair appears as the topmost suggestion in more than 70\% of our benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "OOPSLA '16 conference proceedings.", } @Article{Lifflander:2017:CLO, author = "Jonathan Lifflander and Sriram Krishnamoorthy", title = "Cache locality optimization for recursive programs", journal = j-SIGPLAN, volume = "52", number = "6", pages = "1--16", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062385", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an approach to optimize the cache locality for recursive programs by dynamically splicing---recursively interleaving---the execution of distinct function invocations. By utilizing data effect annotations, we identify concurrency and data reuse opportunities across function invocations and interleave them to reduce reuse distance. We present algorithms that efficiently track effects in recursive programs, detect interference and dependencies, and interleave execution of function invocations using user-level (non-kernel) lightweight threads. To enable multi-core execution, a program is parallelized using a nested fork/join programming model. Our cache optimization strategy is designed to work in the context of a random work stealing scheduler. We present an implementation using the MIT Cilk framework that demonstrates significant improvements in sequential and parallel performance, competitive with a state-of-the-art compile-time optimizer for loop programs and a domain-specific optimizer for stencil programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Saarikivi:2017:FEC, author = "Olli Saarikivi and Margus Veanes and Todd Mytkowicz and Madan Musuvathi", title = "Fusing effectful comprehensions", journal = j-SIGPLAN, volume = "52", number = "6", pages = "17--32", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062362", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "List comprehensions provide a powerful abstraction mechanism for expressing computations over ordered collections of data declaratively without having to use explicit iteration constructs. This paper puts forth effectful comprehensions as an elegant way to describe list comprehensions that incorporate loop-carried state. 
This is motivated by operations such as compression/decompression and serialization/deserialization that are common in log/data processing pipelines and require loop-carried state when processing an input stream of data. We build on the underlying theory of symbolic transducers to fuse pipelines of effectful comprehensions into a single representation, from which efficient code can be generated. Using background theory reasoning with an SMT solver, our fusion and subsequent reachability based branch elimination algorithms can significantly reduce the complexity of the fused pipelines. Our implementation shows significant speedups over reasonable hand-written code (3.4 $ \times $, on average) and traditionally fused version of the pipeline (2.6 $ \times $, on average) for a variety of examples, including scenarios for extracting fields with regular expressions, processing XML with XPath, and running queries over encoded data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Ding:2017:GTD, author = "Yufei Ding and Lin Ning and Hui Guan and Xipeng Shen", title = "Generalizations of the theory and deployment of triangular inequality for compiler-based strength reduction", journal = j-SIGPLAN, volume = "52", number = "6", pages = "33--48", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Triangular Inequality (TI) has been used in many manual algorithm designs to achieve good efficiency in solving some distance calculation-based problems. This paper presents our generalization of the idea into a compiler optimization technique, named TI-based strength reduction. The generalization consists of three parts. The first is the establishment of the theoretic foundation of this new optimization via the development of a new form of TI named Angular Triangular Inequality, along with several fundamental theorems. The second is the revealing of the properties of the new forms of TI and the proposal of guided TI adaptation, a systematic method to address the difficulties in effective deployments of TI optimizations. The third is an integration of the new optimization technique in an open-source compiler. Experiments on a set of data mining and machine learning algorithms show that the new technique can speed up the standard implementations by as much as 134X and 46X on average for distance-related problems, outperforming previous TI-based optimizations by 2.35X on average. 
It also extends the applicability of TI-based optimizations to vector related problems, producing tens of times of speedup.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Menendez:2017:AID, author = "David Menendez and Santosh Nagarakatte", title = "{Alive-Infer}: data-driven precondition inference for peephole optimizations in {LLVM}", journal = j-SIGPLAN, volume = "52", number = "6", pages = "49--63", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062372", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Peephole optimizations are a common source of compiler bugs. Compiler developers typically transform an incorrect peephole optimization into a valid one by strengthening the precondition. This process is challenging and tedious. This paper proposes Alive-Infer, a data-driven approach that infers preconditions for peephole optimizations expressed in Alive. Alive-Infer generates positive and negative examples for an optimization, enumerates predicates on-demand, and learns a set of predicates that separate the positive and negative examples. Alive-Infer repeats this process until it finds a precondition that ensures the validity of the optimization. Alive-Infer reports both a weakest precondition and a set of succinct partial preconditions to the developer. Our prototype generates preconditions that are weaker than LLVM's preconditions for 73 optimizations in the Alive suite. We also demonstrate the applicability of this technique to generalize 54 optimization patterns generated by Souper, an LLVM IR-based superoptimizer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Yessenov:2017:DAD, author = "Kuat Yessenov and Ivan Kuraj and Armando Solar-Lezama", title = "{DemoMatch}: {API} discovery from demonstrations", journal = j-SIGPLAN, volume = "52", number = "6", pages = "64--78", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062386", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce DemoMatch, a tool for API discovery that allows the user to discover how to implement functionality using a software framework by demonstrating the functionality in existing applications built with the same framework. DemoMatch matches the demonstrations against a database of execution traces called Semeru and generates code snippets explaining how to use the functionality. 
We evaluated DemoMatch on several case studies involving Java Swing and Eclipse RCP.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{David:2017:SBT, author = "Yaniv David and Nimrod Partush and Eran Yahav", title = "Similarity of binaries through re-optimization", journal = j-SIGPLAN, volume = "52", number = "6", pages = "79--94", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062387", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a scalable approach for establishing similarity between stripped binaries (with no debug information). The main challenge in binary similarity, is to establish similarity even when the code has been compiled using different compilers, with different optimization levels, or targeting different architectures. Overcoming this challenge, while avoiding false positives, is invaluable to the process of reverse engineering and the process of locating vulnerable code. We present a technique that is scalable and precise, as it alleviates the need for heavyweight semantic comparison by performing out-of-context re-optimization of procedure fragments. It works by decomposing binary procedures to comparable fragments and transforming them to a canonical, normalized form using the compiler optimizer, which enables finding equivalent fragments through simple syntactic comparison. We use a statistical framework built by analyzing samples collected ``in the wild'' to generate a global context that quantifies the significance of each pair of fragments, and uses it to lift pairwise fragment equivalence to whole procedure similarity. We have implemented our technique in a tool called {\tt GitZ} and performed an extensive evaluation. We show that {\tt GitZ} is able to perform millions of comparisons efficiently, and find similarity with high accuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Bastani:2017:SPI, author = "Osbert Bastani and Rahul Sharma and Alex Aiken and Percy Liang", title = "Synthesizing program input grammars", journal = j-SIGPLAN, volume = "52", number = "6", pages = "95--110", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062349", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an algorithm for synthesizing a context-free grammar encoding the language of valid program inputs from a set of input examples and blackbox access to the program. Our algorithm addresses shortcomings of existing grammar inference algorithms, which both severely overgeneralize and are prohibitively slow. Our implementation, GLADE, leverages the grammar synthesized by our algorithm to fuzz test programs with structured inputs. 
We show that GLADE substantially increases the incremental coverage on valid inputs compared to two baseline fuzzers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Huang:2017:CMC, author = "Daniel Huang and Jean-Baptiste Tristan and Greg Morrisett", title = "Compiling {Markov} chain {Monte Carlo} algorithms for probabilistic modeling", journal = j-SIGPLAN, volume = "52", number = "6", pages = "111--125", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062375", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The problem of probabilistic modeling and inference, at a high-level, can be viewed as constructing a ( model, query, inference ) tuple, where an inference algorithm implements a query on a model. Notably, the derivation of inference algorithms can be a difficult and error-prone task. Hence, researchers have explored how ideas from probabilistic programming can be applied. In the context of constructing these tuples, probabilistic programming can be seen as taking a language-based approach to probabilistic modeling and inference. For instance, by using (1) appropriate languages for expressing models and queries and (2) devising inference techniques that operate on encodings of models (and queries) as program expressions, the task of inference can be automated. In this paper, we describe a compiler that transforms a probabilistic model written in a restricted modeling language and a query for posterior samples given observed data into a Markov Chain Monte Carlo (MCMC) inference algorithm that implements the query. The compiler uses a sequence of intermediate languages (ILs) that guide it in gradually and successively refining a declarative specification of a probabilistic model and the query into an executable MCMC inference algorithm. The compilation strategy produces composable MCMC algorithms for execution on a CPU or GPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Eizenberg:2017:BBL, author = "Ariel Eizenberg and Yuanfeng Peng and Toma Pigli and William Mansky and Joseph Devietti", title = "{BARRACUDA}: binary-level analysis of runtime {RAces} in {CUDA} programs", journal = j-SIGPLAN, volume = "52", number = "6", pages = "126--140", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062342", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "GPU programming models enable and encourage massively parallel programming with over a million threads, requiring extreme parallelism to achieve good performance. Massive parallelism brings significant correctness challenges by increasing the possibility for bugs as the number of thread interleavings balloons. Conventional dynamic safety analyses struggle to run at this scale. We present BARRACUDA, a concurrency bug detector for GPU programs written in Nvidia's CUDA language. 
BARRACUDA handles a wider range of parallelism constructs than previous work, including branch operations, low-level atomics and memory fences, which allows BARRACUDA to detect new classes of concurrency bugs. BARRACUDA operates at the binary level for increased compatibility with existing code, leveraging a new binary instrumentation framework that is extensible to other dynamic analyses. BARRACUDA incorporates a number of novel optimizations that are crucial for scaling concurrency bug detection to over a million threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Rhodes:2017:BSC, author = "Dustin Rhodes and Cormac Flanagan and Stephen N. Freund", title = "{BigFoot}: static check placement for dynamic race detection", journal = j-SIGPLAN, volume = "52", number = "6", pages = "141--156", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062350", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Precise dynamic data race detectors provide strong correctness guarantees but have high overheads because they generally keep analysis state in a separate shadow location for each heap memory location, and they check (and potentially update) the corresponding shadow location on each heap access. The BigFoot dynamic data race detector uses a combination of static and dynamic analysis techniques to coalesce checks and compress shadow locations. With BigFoot, multiple accesses to an object or array often induce a single coalesced check that manipulates a single compressed shadow location, resulting in a performance improvement over FastTrack of 61\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Kini:2017:DRP, author = "Dileep Kini and Umang Mathur and Mahesh Viswanathan", title = "Dynamic race prediction in linear time", journal = j-SIGPLAN, volume = "52", number = "6", pages = "157--170", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062374", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Writing reliable concurrent software remains a huge challenge for today's programmers. Programmers rarely reason about their code by explicitly considering different possible inter-leavings of its execution. We consider the problem of detecting data races from individual executions in a sound manner. The classical approach to solving this problem has been to use Lamport's happens-before (HB) relation. Until now HB remains the only approach that runs in linear time. Previous efforts in improving over HB such as causally-precedes (CP) and maximal causal models fall short due to the fact that they are not implementable efficiently and hence have to compromise on their race detecting ability by limiting their techniques to bounded sized fragments of the execution. We present a new relation weak-causally-precedes (WCP) that is provably better than CP in terms of being able to detect more races, while still remaining sound. 
Moreover, it admits a linear time algorithm which works on the entire execution without having to fragment it.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Billes:2017:SBB, author = "Marina Billes and Anders M{\o}ller and Michael Pradel", title = "Systematic black-box analysis of collaborative web applications", journal = j-SIGPLAN, volume = "52", number = "6", pages = "171--184", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062364", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Web applications, such as collaborative editors that allow multiple clients to concurrently interact on a shared resource, are difficult to implement correctly. Existing techniques for analyzing concurrent software do not scale to such complex systems or do not consider multiple interacting clients. This paper presents Simian, the first fully automated technique for systematically analyzing multi-client web applications. Naively exploring all possible interactions between a set of clients of such applications is practically infeasible. Simian obtains scalability for real-world applications by using a two-phase black-box approach. The application code remains unknown to the analysis and is first explored systematically using a single client to infer potential conflicts between client events triggered in a specific context. The second phase synthesizes multi-client interactions targeted at triggering misbehavior that may result from the potential conflicts, and reports an inconsistency if the clients do not converge to a consistent state. We evaluate the analysis on three widely used systems, Google Docs, Firepad, and ownCloud Documents, where it reports a variety of inconsistencies, such as incorrect formatting and misplaced text fragments. Moreover, we find that the two-phase approach runs 10x faster compared to exhaustive exploration, making systematic analysis practically applicable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Haas:2017:BWS, author = "Andreas Haas and Andreas Rossberg and Derek L. Schuff and Ben L. Titzer and Michael Holman and Dan Gohman and Luke Wagner and Alon Zakai and J. F. Bastien", title = "Bringing the web up to speed with {WebAssembly}", journal = j-SIGPLAN, volume = "52", number = "6", pages = "185--200", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062363", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The maturation of the Web platform has given rise to sophisticated and demanding Web applications such as interactive 3D visualization, audio and video software, and games. With that, efficiency and security of code on the Web has become more important than ever. Yet JavaScript as the only built-in language of the Web is not well-equipped to meet these requirements, especially as a compilation target. 
Engineers from the four major browser vendors have risen to the challenge and collaboratively designed a portable low-level bytecode called WebAssembly. It offers compact representation, efficient validation and compilation, and safe low to no-overhead execution. Rather than committing to a specific programming model, WebAssembly is an abstraction over modern hardware, making it language-, hardware-, and platform-independent, with use cases beyond just the Web. WebAssembly has been designed with a formal semantics from the start. We describe the motivation, design and formal semantics of WebAssembly and provide some preliminary experience with implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Petrashko:2017:MCU, author = "Dmitry Petrashko and Ondrej Lhot{\'a}k and Martin Odersky", title = "Miniphases: compilation using modular and efficient tree transformations", journal = j-SIGPLAN, volume = "52", number = "6", pages = "201--216", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062346", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Production compilers commonly perform dozens of transformations on an intermediate representation. Running those transformations in separate passes harms performance. One approach to recover performance is to combine transformations by hand in order to reduce number of passes. Such an approach harms modularity, and thus makes it hard to maintain and evolve a compiler over the long term, and makes reasoning about performance harder. This paper describes a methodology that allows a compiler writer to define multiple transformations separately, but fuse them into a single traversal of the intermediate representation when the compiler runs. This approach has been implemented in a compiler for the Scala language. Our performance evaluation indicates that this approach reduces the running time of tree transformations by 35\% and shows that this is due to improved cache friendliness. At the same time, the approach improves total memory consumption by reducing the object tenuring rate by 50\%. This approach enables compiler writers to write transformations that are both modular and fast at the same time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Canino:2017:PAE, author = "Anthony Canino and Yu David Liu", title = "Proactive and adaptive energy-aware programming with mixed typechecking", journal = j-SIGPLAN, volume = "52", number = "6", pages = "217--232", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062356", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Application-level energy management is an important dimension of energy optimization. In this paper, we introduce ENT, a novel programming language for enabling *proactive* and *adaptive* mode-based energy management at the application level. 
The proactive design allows programmers to apply their application knowledge to energy management, by characterizing the energy behavior of different program fragments with modes. The adaptive design allows such characterization to be delayed until run time, useful for capturing dynamic program behavior dependent on program states, configuration settings, external battery levels, or CPU temperatures. The key insight is that both proactiveness and adaptiveness can be unified under a type system that combines static typing and dynamic typing. ENT has been implemented as an extension to Java, and successfully ported to three energy-conscious platforms: an Intel-based laptop, a Raspberry Pi, and an Android phone. Evaluation shows ENT improves the programmability, debuggability, and energy efficiency of battery-aware and temperature-aware programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Kedia:2017:SFS, author = "Piyus Kedia and Manuel Costa and Matthew Parkinson and Kapil Vaswani and Dimitrios Vytiniotis and Aaron Blankstein", title = "Simple, fast, and safe manual memory management", journal = j-SIGPLAN, volume = "52", number = "6", pages = "233--247", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062376", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Safe programming languages are readily available, but many applications continue to be written in unsafe languages because of efficiency. As a consequence, many applications continue to have exploitable memory safety bugs. Since garbage collection is a major source of inefficiency in the implementation of safe languages, replacing it with safe manual memory management would be an important step towards solving this problem. Previous approaches to safe manual memory management use programming models based on regions, unique pointers, borrowing of references, and ownership types. We propose a much simpler programming model that does not require any of these concepts. Starting from the design of an imperative type safe language (like Java or C\#), we just add a delete operator to free memory explicitly and an exception which is thrown if the program dereferences a pointer to freed memory. We propose an efficient implementation of this programming model that guarantees type safety.
Experimental results from our implementation based on the C\# native compiler show that this design achieves up to 3x reduction in peak working set and run time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Kincaid:2017:CRA, author = "Zachary Kincaid and Jason Breck and Ashkan Forouhi Boroujeni and Thomas Reps", title = "Compositional recurrence analysis revisited", journal = j-SIGPLAN, volume = "52", number = "6", pages = "248--262", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062373", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compositional recurrence analysis (CRA) is a static-analysis method based on a combination of symbolic analysis and abstract interpretation. This paper addresses the problem of creating a context-sensitive interprocedural version of CRA that handles recursive procedures. The problem is non-trivial because there is an ``impedance mismatch'' between CRA, which relies on analysis techniques based on regular languages (i.e., Tarjan's path-expression method), and the context-free-language underpinnings of context-sensitive analysis. We show how to address this impedance mismatch by augmenting the CRA abstract domain with additional operations. We call the resulting algorithm Interprocedural CRA (ICRA). Our experiments with ICRA show that it has broad overall strength compared with several state-of-the-art software model checkers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Thiessen:2017:CTP, author = "Rei Thiessen and Ondrej Lhot{\'a}k", title = "Context transformations for pointer analysis", journal = j-SIGPLAN, volume = "52", number = "6", pages = "263--277", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062359", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Points-to analysis for Java benefits greatly from context sensitivity. CFL-reachability and k -limited context strings are two approaches to obtaining context sensitivity with different advantages: CFL-reachability allows local reasoning about data-value flow and thus is suitable for demand-driven analyses, whereas k -limited analyses allow object sensitivity which is a superior calling context abstraction for object-oriented languages. We combine the advantages of both approaches to obtain a context-sensitive analysis that is as precise as k -limited context strings, but is more efficient to compute. Our key insight is based on a novel abstraction of contexts adapted from CFL-reachability that represents a relation between two calling contexts as a composition of transformations over contexts. We formulate pointer analysis in an algebraic structure of context transformations, which is a set of functions over calling contexts closed under function composition. We show that the context representation of context-string-based analyses is an explicit enumeration of all input and output values of context transformations. 
CFL-reachability-based pointer analysis is formulated to use call-strings as contexts, but the context transformations concept can be applied to any context abstraction used in k -limited analyses, including object- and type-sensitive analysis. The result is a more efficient algorithm for computing context-sensitive results for a wide variety of context configurations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Tan:2017:EPP, author = "Tian Tan and Yue Li and Jingling Xue", title = "Efficient and precise points-to analysis: modeling the heap by merging equivalent automata", journal = j-SIGPLAN, volume = "52", number = "6", pages = "278--291", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062360", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mainstream points-to analysis techniques for object-oriented languages rely predominantly on the allocation-site abstraction to model heap objects. We present MAHJONG, a novel heap abstraction that is specifically developed to address the needs of an important class of type-dependent clients, such as call graph construction, devirtualization and may-fail casting. By merging equivalent automata representing type-consistent objects that are created by the allocation-site abstraction, MAHJONG enables an allocation-site-based points-to analysis to run significantly faster while achieving nearly the same precision for type-dependent clients. MAHJONG is simple conceptually, efficient, and drops easily on any allocation-site-based points-to analysis. We demonstrate its effectiveness by discussing some insights on why it is a better alternative of the allocation-site abstraction for type-dependent clients and evaluating it extensively on 12 large real-world Java programs with five context-sensitive points-to analyses and three widely used type-dependent clients. MAHJONG is expected to provide significant benefits for many program analyses where call graphs are required.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Santhiar:2017:SDD, author = "Anirudh Santhiar and Aditya Kanade", title = "Static deadlock detection for asynchronous {C\#} programs", journal = j-SIGPLAN, volume = "52", number = "6", pages = "292--305", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062361", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Asynchronous programming is a standard approach for designing responsive applications. Modern languages such as C\# provide async/await primitives for the disciplined use of asynchrony. In spite of this, programs can deadlock because of incorrect use of blocking operations along with non-blocking (asynchronous) operations. While developers are aware of this problem, there is no automated technique to detect deadlocks in asynchronous programs. 
We present a novel representation of control flow and scheduling of asynchronous programs, called continuation scheduling graph and formulate necessary conditions for a deadlock to occur in a program. We design static analyses to construct continuation scheduling graphs of asynchronous C\# programs and to identify deadlocks in them. We have implemented the static analyses in a tool called DeadWait. Using DeadWait, we found 43 previously unknown deadlocks in 11 asynchronous C\# libraries. We reported the deadlocks to the library developers. They have confirmed and fixed 40 of them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Fu:2017:AHC, author = "Zhoulai Fu and Zhendong Su", title = "Achieving high coverage for floating-point code via unconstrained programming", journal = j-SIGPLAN, volume = "52", number = "6", pages = "306--319", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062383", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Achieving high code coverage is essential in testing, which gives us confidence in code quality. Testing floating-point code usually requires painstaking efforts in handling floating-point constraints, e.g., in symbolic execution. This paper turns the challenge of testing floating-point code into the opportunity of applying unconstrained programming --- the mathematical solution for calculating function minimum points over the entire search space. Our core insight is to derive a representing function from the floating-point program, any of whose minimum points is a test input guaranteed to exercise a new branch of the tested program. This guarantee allows us to achieve high coverage of the floating-point program by repeatedly minimizing the representing function. We have realized this approach in a tool called CoverMe and conducted an extensive evaluation of it on Sun's C math library. Our evaluation results show that CoverMe achieves, on average, 90.8\% branch coverage in 6.9 seconds, drastically outperforming our compared tools: (1) Random testing, (2) AFL, a highly optimized, robust fuzzer released by Google, and (3) Austin, a state-of-the-art coverage-based testing tool designed to support floating-point code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Chamith:2017:IPL, author = "Buddhika Chamith and Bo Joel Svensson and Luke Dalessandro and Ryan R. Newton", title = "Instruction punning: lightweight instrumentation for x86-64", journal = j-SIGPLAN, volume = "52", number = "6", pages = "320--332", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062344", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Existing techniques for injecting probes into running applications are limited; they either fail to support probing arbitrary locations, or to support scalable, rapid toggling of probes. We introduce a new technique on x86-64, called instruction punning, which allows scalable probes at any instruction. 
The key idea is that when we inject a jump instruction, the relative address of the jump serves simultaneously as data and as an instruction sequence. We show that this approach achieves probe invocation overheads of only a few dozen cycles, and probe activation/deactivation costs that are cheaper than a system call, even when all threads in the system are both invoking probes and toggling them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{DAntras:2017:LOD, author = "Amanieu D'Antras and Cosmin Gorgovan and Jim Garside and Mikel Luj{\'a}n", title = "Low overhead dynamic binary translation on {ARM}", journal = j-SIGPLAN, volume = "52", number = "6", pages = "333--346", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062371", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The ARMv8 architecture introduced AArch64, a 64-bit execution mode with a new instruction set, while retaining binary compatibility with previous versions of the ARM architecture through AArch32, a 32-bit execution mode. Most hardware implementations of ARMv8 processors support both AArch32 and AArch64, which comes at a cost in hardware complexity. We present MAMBO-X64, a dynamic binary translator for Linux which executes 32-bit ARM binaries using only the AArch64 instruction set. We have evaluated the performance of MAMBO-X64 on three existing ARMv8 processors which support both AArch32 and AArch64 instruction sets. The performance was measured by comparing the running time of 32-bit benchmarks running under MAMBO-X64 with the same benchmark running natively. On SPEC CPU2006, we achieve a geometric mean overhead of less than 7.5\% on in-order Cortex-A53 processors and a performance improvement of 1\% on out-of-order X-Gene 1 processors. MAMBO-X64 achieves such low overhead by novel optimizations to map AArch32 floating-point registers to AArch64 registers dynamically, handle overflowing address calculations efficiently, generate traces that harness hardware return address prediction, and handle operating system signals accurately.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Zhang:2017:SPE, author = "Qirun Zhang and Chengnian Sun and Zhendong Su", title = "Skeletal program enumeration for rigorous compiler testing", journal = j-SIGPLAN, volume = "52", number = "6", pages = "347--361", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062379", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A program can be viewed as a syntactic structure P (syntactic skeleton) parameterized by a collection of identifiers V (variable names). This paper introduces the skeletal program enumeration (SPE) problem: Given a syntactic skeleton P and a set of variables V , enumerate a set of programs P exhibiting all possible variable usage patterns within P. 
It proposes an effective realization of SPE for systematic, rigorous compiler testing by leveraging three important observations: (1) Programs with different variable usage patterns exhibit diverse control- and data-dependence, and help exploit different compiler optimizations; (2) most real compiler bugs were revealed by small tests (i.e., small-sized P) --- this ``small-scope'' observation opens up SPE for practical compiler validation; and (3) SPE is exhaustive w.r.t. a given syntactic skeleton and variable set, offering a level of guarantee absent from all existing compiler testing techniques. The key challenge of SPE is how to eliminate the enormous amount of equivalent programs w.r.t. \alpha -conversion. Our main technical contribution is a novel algorithm for computing the canonical (and smallest) set of all non- \alpha -equivalent programs. To demonstrate its practical utility, we have applied the SPE technique to test C/C++ compilers using syntactic skeletons derived from their own regression test-suites. Our evaluation results are extremely encouraging. In less than six months, our approach has led to 217 confirmed GCC/Clang bug reports, 119 of which have already been fixed, and the majority are long latent despite extensive prior testing efforts. Our SPE algorithm also provides six orders of magnitude reduction. Moreover, in three weeks, our technique has found 29 CompCert crashing bugs and 42 bugs in two Scala optimizing compilers. These results demonstrate our SPE technique's generality and further illustrate its effectiveness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Antonopoulos:2017:DIS, author = "Timos Antonopoulos and Paul Gazzillo and Michael Hicks and Eric Koskinen and Tachio Terauchi and Shiyi Wei", title = "Decomposition instead of self-composition for proving the absence of timing channels", journal = j-SIGPLAN, volume = "52", number = "6", pages = "362--375", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062378", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a novel approach to proving the absence of timing channels. The idea is to partition the program's execution traces in such a way that each partition component is checked for timing attack resilience by a time complexity analysis and that per-component resilience implies the resilience of the whole program. We construct a partition by splitting the program traces at secret-independent branches. This ensures that any pair of traces with the same public input has a component containing both traces. Crucially, the per-component checks can be normal safety properties expressed in terms of a single execution. Our approach is thus in contrast to prior approaches, such as self-composition, that aim to reason about multiple $ (k \geq 2) $ executions at once. We formalize the above as an approach called quotient partitioning, generalized to any k -safety property, and prove it to be sound. A key feature of our approach is a demand-driven partitioning strategy that uses a regex-like notion called trails to identify sets of execution traces, particularly those influenced by tainted (or secret) data. 
We have applied our technique in a prototype implementation tool called Blazer, based on WALA, PPL, and the brics automaton library. We have proved timing-channel freedom of (or synthesized an attack specification for) 24 programs written in Java bytecode, including 6 classic examples from the literature and 6 examples extracted from the DARPA STAC challenge problems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Hu:2017:API, author = "Qinheping Hu and Loris D'Antoni", title = "Automatic program inversion using symbolic transducers", journal = j-SIGPLAN, volume = "52", number = "6", pages = "376--389", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062345", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a fully-automated technique for inverting functional programs that operate over lists such as string encoders and decoders. We consider programs that can be modeled using symbolic extended finite transducers, an expressive model that can describe complex list-manipulating programs while retaining several decidable properties. Concretely, given a program P expressed as such a transducer, we propose techniques for: (1) checking whether P is injective and, if that is the case, (2) building a transducer P$^{-1}$ describing its inverse. We first show that it is undecidable to check whether such a transducer is injective and propose an algorithm for checking injectivity for a restricted, but practical, class of transducers. We then propose an inversion algorithm based on the following idea: if a transducer is injective, inverting it amounts to inverting all its individual transitions. We leverage recent advances in program synthesis and show that the transition inversion problem can be expressed as an instance of the syntax-guided synthesis framework. Finally, we implement the proposed techniques in a tool and show that it can invert 13 out of 14 real, complex string encoders and decoders, producing inverse programs that are substantially identical to manually written ones.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Ohmann:2017:CFR, author = "Peter Ohmann and Alexander Brooks and Loris D'Antoni and Ben Liblit", title = "Control-flow recovery from partial failure reports", journal = j-SIGPLAN, volume = "52", number = "6", pages = "390--405", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062368", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Debugging is difficult. When software fails in production, debugging is even harder, as failure reports usually provide only an incomplete picture of the failing execution. We present a system that answers control-flow queries posed by developers as formal languages, indicating whether the query expresses control flow that is possible or impossible for a given failure report. We consider three separate approaches that trade off precision, expressiveness for failure constraints, and scalability.
We also introduce a new subclass of regular languages, the unreliable trace languages, which are particularly suited to answering control-flow queries in polynomial time. Our system answers queries remarkably efficiently when we encode failure constraints and user queries entirely as unreliable trace languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Doychev:2017:RAS, author = "Goran Doychev and Boris K{\"o}pf", title = "Rigorous analysis of software countermeasures against cache attacks", journal = j-SIGPLAN, volume = "52", number = "6", pages = "406--421", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062388", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "CPU caches introduce variations into the execution time of programs that can be exploited by adversaries to recover private information about users or cryptographic keys. Establishing the security of countermeasures against this threat often requires intricate reasoning about the interactions of program code, memory layout, and hardware architecture and has so far only been done for restricted cases. In this paper we devise novel techniques that provide support for bit-level and arithmetic reasoning about memory accesses in the presence of dynamic memory allocation. These techniques enable us to perform the first rigorous analysis of widely deployed software countermeasures against cache attacks on modular exponentiation, based on executable code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Feng:2017:CBSb, author = "Yu Feng and Ruben Martins and Jacob {Van Geffen} and Isil Dillig and Swarat Chaudhuri", title = "Component-based synthesis of table consolidation and transformation tasks from examples", journal = j-SIGPLAN, volume = "52", number = "6", pages = "422--436", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062351", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a novel component-based synthesis algorithm that marries the power of type-directed search with lightweight SMT-based deduction and partial evaluation. Given a set of components together with their over-approximate first-order specifications, our method first generates a program sketch over a subset of the components and checks its feasibility using an SMT solver. Since a program sketch typically represents many concrete programs, the use of SMT-based deduction greatly increases the scalability of the algorithm. Once a feasible program sketch is found, our algorithm completes the sketch in a bottom-up fashion, using partial evaluation to further increase the power of deduction for rejecting partially-filled program sketches. We apply the proposed synthesis methodology for automating a large class of data preparation tasks that commonly arise in data science. 
We have evaluated our synthesis algorithm on dozens of data wrangling and consolidation tasks obtained from on-line forums, and we show that our approach can automatically solve a large class of problems encountered by R users.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Beckett:2017:NCS, author = "Ryan Beckett and Ratul Mahajan and Todd Millstein and Jitendra Padhye and David Walker", title = "Network configuration synthesis with abstract topologies", journal = j-SIGPLAN, volume = "52", number = "6", pages = "437--451", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062367", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We develop Propane/AT, a system to synthesize provably-correct BGP (border gateway protocol) configurations for large, evolving networks from high-level specifications of topology, routing policy, and fault-tolerance requirements. Propane/AT is based on new abstractions for capturing parameterized network topologies and their evolution, and algorithms to analyze the impact of topology and routing policy on fault tolerance. Our algorithms operate entirely on abstract topologies. We prove that the properties established by our analyses hold for every concrete instantiation of the given abstract topology. Propane/AT also guarantees that only incremental changes to existing device configurations are required when the network evolves to add or remove devices and links. Our experiments with real-world topologies and policies show that our abstractions and algorithms are effective, and that, for large networks, Propane/AT synthesizes configurations two orders of magnitude faster than systems that operate on concrete topologies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Wang:2017:SHE, author = "Chenglong Wang and Alvin Cheung and Rastislav Bodik", title = "Synthesizing highly expressive {SQL} queries from input-output examples", journal = j-SIGPLAN, volume = "52", number = "6", pages = "452--466", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062365", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "SQL is the de facto language for manipulating relational data. Though powerful, many users find it difficult to write SQL queries due to highly expressive constructs. While using the programming-by-example paradigm to help users write SQL queries is an attractive proposition, as evidenced by online help forums such as Stack Overflow, developing techniques for synthesizing SQL queries from given input-output (I/O) examples has been difficult, due to the large space of SQL queries as a result of its rich set of operators. In this paper, we present a new scalable and efficient algorithm for synthesizing SQL queries based on I/O examples. The key innovation of our algorithm is development of a language for abstract queries, i.e., queries with uninstantiated operators, that can be used to express a large space of SQL queries efficiently. 
Using abstract queries to represent the search space nicely decomposes the synthesis problem into two tasks: (1) searching for abstract queries that can potentially satisfy the given I/O examples, and (2) instantiating the found abstract queries and ranking the results. We have implemented this algorithm in a new tool called Scythe and evaluated it using 193 benchmarks collected from Stack Overflow. Our evaluation shows that Scythe can efficiently solve 74\% of the benchmarks, most in just a few seconds, and the queries range from simple ones involving a single selection to complex queries with 6 nested subqueries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Bornholt:2017:SMM, author = "James Bornholt and Emina Torlak", title = "Synthesizing memory models from framework sketches and litmus tests", journal = j-SIGPLAN, volume = "52", number = "6", pages = "467--481", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062353", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A memory consistency model specifies which writes to shared memory a given read may see. Ambiguities or errors in these specifications can lead to bugs in both compilers and applications. Yet architectures usually define their memory models with prose and litmus tests --- small concurrent programs that demonstrate allowed and forbidden outcomes. Recent work has formalized the memory models of common architectures through substantial manual effort, but as new architectures emerge, there is a growing need for tools to aid these efforts. This paper presents MemSynth, a synthesis-aided system for reasoning about axiomatic specifications of memory models. MemSynth takes as input a set of litmus tests and a framework sketch that defines a class of memory models. The sketch comprises a set of axioms with missing expressions (or holes). Given these inputs, MemSynth synthesizes a completion of the axioms --- i.e., a memory model --- that gives the desired outcome on all tests. The MemSynth engine employs a novel embedding of bounded relational logic in a solver-aided programming language, which enables it to tackle complex synthesis queries intractable to existing relational solvers. This design also enables it to solve new kinds of queries, such as checking if a set of litmus tests unambiguously defines a memory model within a framework sketch. We show that MemSynth can synthesize specifications for x86 in under two seconds, and for PowerPC in 12 seconds from 768 litmus tests. Our ambiguity check identifies missing tests from both the Intel x86 documentation and the validation suite of a previous PowerPC formalization. We also used MemSynth to reproduce, debug, and automatically repair a paper on comparing memory models in just two days.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Maurer:2017:CC, author = "Luke Maurer and Paul Downen and Zena M.
Ariola and Simon Peyton Jones", title = "Compiling without continuations", journal = j-SIGPLAN, volume = "52", number = "6", pages = "482--494", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062380", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many fields of study in compilers give rise to the concept of a join point-a place where different execution paths come together. Join points are often treated as functions or continuations, but we believe it is time to study them in their own right. We show that adding join points to a direct-style functional intermediate language is a simple but powerful change that allows new optimizations to be performed, including a significant improvement to list fusion. Finally, we report on recent work on adding join points to the intermediate language of the Glasgow Haskell Compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Patterson:2017:FRM, author = "Daniel Patterson and Jamie Perconti and Christos Dimoulas and Amal Ahmed", title = "{FunTAL}: reasonably mixing a functional language with assembly", journal = j-SIGPLAN, volume = "52", number = "6", pages = "495--509", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062347", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present FunTAL, the first multi-language system to formalize safe interoperability between a high-level functional language and low-level assembly code while supporting compositional reasoning about the mix. A central challenge in developing such a multi-language is bridging the gap between assembly, which is staged into jumps to continuations, and high-level code, where subterms return a result. We present a compositional stack-based typed assembly language that supports components, comprised of one or more basic blocks, that may be embedded in high-level contexts. We also present a logical relation for FunTAL that supports reasoning about equivalence of high-level components and their assembly replacements, mixed-language programs with callbacks between languages, and assembly components comprised of different numbers of basic blocks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Chu:2017:HPQ, author = "Shumo Chu and Konstantin Weitz and Alvin Cheung and Dan Suciu", title = "{HoTTSQL}: proving query rewrites with univalent {SQL} semantics", journal = j-SIGPLAN, volume = "52", number = "6", pages = "510--524", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062348", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Every database system contains a query optimizer that performs query rewrites. Unfortunately, developing query optimizers remains a highly challenging task. 
Part of the challenge comes from the intricacies and rich features of query languages, which make reasoning about rewrite rules difficult. In this paper, we propose a machine-checkable denotational semantics for SQL, the de facto language for relational databases, for rigorously validating rewrite rules. Unlike previously proposed semantics that are either non-mechanized or only cover a small number of SQL language features, our semantics covers all major features of SQL, including bags, correlated subqueries, aggregation, and indexes. Our mechanized semantics, called HoTTSQL, is based on K-Relations and homotopy type theory, where we denote relations as mathematical functions from tuples to univalent types. We have implemented HoTTSQL in Coq, which takes fewer than 300 lines of code, and have proved a wide range of SQL rewrite rules, including those from database research literature (e.g., magic set rewrites) and real-world query optimizers (e.g., subquery elimination). Several of these rewrite rules have never been previously proven correct. In addition, while query equivalence is generally undecidable, we have implemented an automated decision procedure using HoTTSQL for conjunctive queries: a well-studied decidable fragment of SQL that encompasses many real-world queries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Eisenberg:2017:LP, author = "Richard A. Eisenberg and Simon Peyton Jones", title = "Levity polymorphism", journal = j-SIGPLAN, volume = "52", number = "6", pages = "525--539", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062357", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parametric polymorphism is one of the linchpins of modern typed programming, but it comes with a real performance penalty. We describe this penalty; offer a principled way to reason about it (kinds as calling conventions); and propose levity polymorphism. This new form of polymorphism allows abstractions over calling conventions; we detail and verify restrictions that are necessary in order to compile levity-polymorphic functions. Levity polymorphism has created new opportunities in Haskell, including the ability to generalize nearly half of the type classes in GHC's standard library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Farzan:2017:SDC, author = "Azadeh Farzan and Victor Nicolet", title = "Synthesis of divide and conquer parallelism for loops", journal = j-SIGPLAN, volume = "52", number = "6", pages = "540--555", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062355", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Divide-and-conquer is a common parallel programming skeleton supported by many cross-platform multithreaded libraries, and most commonly used by programmers for parallelization.
The challenges of producing (manually or automatically) a correct divide-and-conquer parallel program from a given sequential code are two-fold: (1) assuming that a good solution exists where individual worker threads execute a code identical to the sequential one, the programmer has to provide the extra code for dividing the tasks and combining the partial results (i.e. joins), and (2) the sequential code may not be suitable for divide-and-conquer parallelization as is, and may need to be modified to become a part of a good solution. We address both challenges in this paper. We present an automated synthesis technique to synthesize correct joins and an algorithm for modifying the sequential code to make it suitable for parallelization when necessary. This paper focuses on class of loops that traverse a read-only collection and compute a scalar function over that collection. We present theoretical results for when the necessary modifications to sequential code are possible, theoretical guarantees for the algorithmic solutions presented here, and experimental evaluation of the approach's success in practice and the quality of the produced parallel programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Henriksen:2017:FPF, author = "Troels Henriksen and Niels G. W. Serup and Martin Elsman and Fritz Henglein and Cosmin E. Oancea", title = "{Futhark}: purely functional {GPU-programming} with nested parallelism and in-place array updates", journal = j-SIGPLAN, volume = "52", number = "6", pages = "556--571", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062354", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Futhark is a purely functional data-parallel array language that offers a machine-neutral programming model and an optimising compiler that generates OpenCL code for GPUs. This paper presents the design and implementation of three key features of Futhark that seek a suitable middle ground with imperative approaches. First, in order to express efficient code inside the parallel constructs, we introduce a simple type system for in-place updates that ensures referential transparency and supports equational reasoning. Second, we furnish Futhark with parallel operators capable of expressing efficient strength-reduced code, along with their fusion rules. Third, we present a flattening transformation aimed at enhancing the degree of parallelism that (i) builds on loop interchange and distribution but uses higher-order reasoning rather than array-dependence analysis, and (ii) still allows further locality-of-reference optimisations. 
Finally, an evaluation on 16 benchmarks demonstrates the impact of the language and compiler features and shows application-level performance competitive with hand-written GPU code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Fedyukovich:2017:GSS, author = "Grigory Fedyukovich and Maaz Bin Safeer Ahmad and Rastislav Bodik", title = "Gradual synthesis for static parallelization of single-pass array-processing programs", journal = j-SIGPLAN, volume = "52", number = "6", pages = "572--585", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062382", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parallelizing of software improves its effectiveness and productivity. To guarantee correctness, the parallel and serial versions of the same code must be formally verified to be equivalent. We present a novel approach, called GRASSP, that automatically synthesizes parallel single-pass array-processing programs by treating the given serial versions as specifications. Given arbitrary segmentation of the input array, GRASSP synthesizes a code to determine a new segmentation of the array that allows computing partial results for each segment and merging them. In contrast to other parallelizers, GRASSP gradually considers several parallelization scenarios and certifies the results using constrained Horn solving. For several classes of programs, we show that such parallelization can be performed efficiently. The C++ translations of the GRASSP solutions sped performance by up to 5X relative to serial code on an 8-thread machine and Hadoop translations by up to 10X on a 10-node Amazon EMR cluster.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Bourke:2017:FVC, author = "Timothy Bourke and L{\'e}lio Brun and Pierre-{\'E}variste Dagand and Xavier Leroy and Marc Pouzet and Lionel Rieg", title = "A formally verified compiler for {Lustre}", journal = j-SIGPLAN, volume = "52", number = "6", pages = "586--601", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062358", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The correct compilation of block diagram languages like Lustre, Scade, and a discrete subset of Simulink is important since they are used to program critical embedded control software. We describe the specification and verification in an Interactive Theorem Prover of a compilation chain that treats the key aspects of Lustre: sampling, nodes, and delays. Building on CompCert, we show that repeated execution of the generated assembly code faithfully implements the dataflow semantics of source programs. We resolve two key technical challenges. The first is the change from a synchronous dataflow semantics, where programs manipulate streams of values, to an imperative one, where computations manipulate memory sequentially. The second is the verified compilation of an imperative language with encapsulated state to C code where the state is realized by nested records. 
We also treat a standard control optimization that eliminates unnecessary conditional statements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Abdulla:2017:FCF, author = "Parosh Aziz Abdulla and Mohamed Faouzi Atig and Yu-Fang Chen and Bui Phi Diep and Luk{\'a}s Hol{\'\i}k and Ahmed Rezine and Philipp R{\"u}mmer", title = "Flatten and conquer: a framework for efficient analysis of string constraints", journal = j-SIGPLAN, volume = "52", number = "6", pages = "602--617", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062384", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a uniform and efficient framework for checking the satisfiability of a large class of string constraints. The framework is based on the observation that both satisfiability and unsatisfiability of common constraints can be demonstrated through witnesses with simple patterns. These patterns are captured using flat automata each of which consists of a sequence of simple loops. We build a Counter-Example Guided Abstraction Refinement (CEGAR) framework which contains both an under- and an over-approximation module. The flow of information between the modules allows to increase the precision in an automatic manner. We have implemented the framework as a tool and performed extensive experimentation that demonstrates both the generality and efficiency of our method.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Lahav:2017:RSC, author = "Ori Lahav and Viktor Vafeiadis and Jeehoon Kang and Chung-Kil Hur and Derek Dreyer", title = "Repairing sequential consistency in {C\slash C++11}", journal = j-SIGPLAN, volume = "52", number = "6", pages = "618--632", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062352", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The C/C++11 memory model defines the semantics of concurrent memory accesses in C/C++, and in particular supports racy ``atomic'' accesses at a range of different consistency levels, from very weak consistency (``relaxed'') to strong, sequential consistency (``SC''). Unfortunately, as we observe in this paper, the semantics of SC atomic accesses in C/C++11, as well as in all proposed strengthenings of the semantics, is flawed, in that (contrary to previously published results) both suggested compilation schemes to the Power architecture are unsound. We propose a model, called RC11 (for Repaired C11), with a better semantics for SC accesses that restores the soundness of the compilation schemes to Power, maintains the DRF-SC guarantee, and provides stronger, more useful, guarantees to SC fences. 
In addition, we formally prove, for the first time, the correctness of the proposed stronger compilation schemes to Power that preserve load-to-store ordering and avoid ``out-of-thin-air'' reads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Lee:2017:TUB, author = "Juneyoung Lee and Yoonseung Kim and Youngju Song and Chung-Kil Hur and Sanjoy Das and David Majnemer and John Regehr and Nuno P. Lopes", title = "Taming undefined behavior in {LLVM}", journal = j-SIGPLAN, volume = "52", number = "6", pages = "633--647", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062343", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A central concern for an optimizing compiler is the design of its intermediate representation (IR) for code. The IR should make it easy to perform transformations, and should also afford efficient and precise static analysis. In this paper we study an aspect of IR design that has received little attention: the role of undefined behavior. The IR for every optimizing compiler we have looked at, including GCC, LLVM, Intel's, and Microsoft's, supports one or more forms of undefined behavior (UB), not only to reflect the semantics of UB-heavy programming languages such as C and C++, but also to model inherently unsafe low-level operations such as memory stores and to avoid over-constraining IR semantics to the point that desirable transformations become illegal. The current semantics of LLVM's IR fails to justify some cases of loop unswitching, global value numbering, and other important ``textbook'' optimizations, causing long-standing bugs. We present solutions to the problems we have identified in LLVM's IR and show that most optimizations currently in LLVM remain sound, and that some desirable new transformations become permissible. Our solutions do not degrade compile time or performance of generated code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Schneider:2017:LSM, author = "Scott Schneider and Kun-Lung Wu", title = "Low-synchronization, mostly lock-free, elastic scheduling for streaming runtimes", journal = j-SIGPLAN, volume = "52", number = "6", pages = "648--661", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062366", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the scalable, elastic operator scheduler in IBM Streams 4.2. Streams is a distributed stream processing system used in production at many companies in a wide range of industries. The programming language for Streams, SPL, presents operators, tuples and streams as the primary abstractions. A fundamental SPL optimization is operator fusion, where multiple operators execute in the same process. Streams 4.2 introduces automatic submission-time fusion to simplify application development and deployment. However, potentially thousands of operators could then execute in the same process, with no user guidance for thread placement. 
We needed a way to automatically figure out how many threads to use, with arbitrarily sized applications on a wide variety of hardware, and without any input from programmers. Our solution has two components. The first is a scalable operator scheduler that minimizes synchronization, locks and global data, while allowing threads to execute any operator and dynamically come and go. The second is an elastic algorithm to dynamically adjust the number of threads to optimize performance, using the principles of trusted measurements to establish trends. We demonstrate our scheduler's ability to scale to over a hundred threads, and our elasticity algorithm's ability to adapt to different workloads on an Intel Xeon system with 176 logical cores, and an IBM Power8 system with 184 logical cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Wurthinger:2017:PPE, author = "Thomas W{\"u}rthinger and Christian Wimmer and Christian Humer and Andreas W{\"o}{\ss} and Lukas Stadler and Chris Seaton and Gilles Duboscq and Doug Simon and Matthias Grimmer", title = "Practical partial evaluation for high-performance dynamic language runtimes", journal = j-SIGPLAN, volume = "52", number = "6", pages = "662--676", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062381", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most high-performance dynamic language virtual machines duplicate language semantics in the interpreter, compiler, and runtime system. This violates the principle to not repeat yourself. In contrast, we define languages solely by writing an interpreter. The interpreter performs specializations, e.g., augments the interpreted program with type information and profiling information. Compiled code is derived automatically using partial evaluation while incorporating these specializations. This makes partial evaluation practical in the context of dynamic languages: It reduces the size of the compiled code while still compiling all parts of an operation that are relevant for a particular program. When a speculation fails, execution transfers back to the interpreter, the program re-specializes in the interpreter, and later partial evaluation again transforms the new state of the interpreter to compiled code. We evaluate our approach by comparing our implementations of JavaScript, Ruby, and R with best-in-class specialized production implementations. Our general-purpose compilation system is competitive with production systems even when they have been heavily optimized for the one language they support. For our set of benchmarks, our speedup relative to the V8 JavaScript VM is 0.83x, relative to JRuby is 3.8x, and relative to GNU R is 5x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Muller:2017:RPC, author = "Stefan K. Muller and Umut A. 
Acar and Robert Harper", title = "Responsive parallel computation: bridging competitive and cooperative threading", journal = j-SIGPLAN, volume = "52", number = "6", pages = "677--692", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062370", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Competitive and cooperative threading are widely used abstractions in computing. In competitive threading, threads are scheduled preemptively with the goal of minimizing response time, usually of interactive applications. In cooperative threading, threads are scheduled non-preemptively with the goal of maximizing throughput or minimizing the completion time, usually in compute-intensive applications, e.g. scientific computing, machine learning and AI. Although both of these forms of threading rely on the same abstraction of a thread, they have, to date, remained largely separate forms of computing. Motivated by the recent increase in the mainstream use of multicore computers, we propose a threading model that aims to unify competitive and cooperative threading. To this end, we extend the classic graph-based cost model for cooperative threading to allow for competitive threading, and describe how such a cost model may be used in a programming language by presenting a language and a corresponding cost semantics. Finally, we show that the cost model and the semantics are realizable by presenting an operational semantics for the language that specifies the behavior of an implementation, as well as an implementation and a small empirical evaluation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Mamouras:2017:SMS, author = "Konstantinos Mamouras and Mukund Raghothaman and Rajeev Alur and Zachary G. Ives and Sanjeev Khanna", title = "{StreamQRE}: modular specification and efficient evaluation of quantitative queries over streaming data", journal = j-SIGPLAN, volume = "52", number = "6", pages = "693--708", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062369", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Real-time decision making in emerging IoT applications typically relies on computing quantitative summaries of large data streams in an efficient and incremental manner. To simplify the task of programming the desired logic, we propose StreamQRE, which provides natural and high-level constructs for processing streaming data. Our language has a novel integration of linguistic constructs from two distinct programming paradigms: streaming extensions of relational query languages and quantitative extensions of regular expressions. The former allows the programmer to employ relational constructs to partition the input data by keys and to integrate data streams from different sources, while the latter can be used to exploit the logical hierarchy in the input stream for modular specifications. We first present the core language with a small set of combinators, formal semantics, and a decidable type system. We then show how to express a number of common patterns with illustrative examples. 
Our compilation algorithm translates the high-level query into a streaming algorithm with precise complexity bounds on per-item processing time and total memory footprint. We also show how to integrate approximation algorithms into our framework. We report on an implementation in Java, and evaluate it with respect to existing high-performance engines for processing streaming data. Our experimental evaluation shows that (1) StreamQRE allows more natural and succinct specification of queries compared to existing frameworks, (2) the throughput of our implementation is higher than comparable systems (for example, two-to-four times greater than RxJava), and (3) the approximation algorithms supported by our implementation can lead to substantial memory savings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '17 conference proceedings.", } @Article{Weirich:2017:IDT, author = "Stephanie Weirich", title = "The influence of dependent types (keynote)", journal = j-SIGPLAN, volume = "52", number = "1", pages = "1--1", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009923", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "What has dependent type theory done for Haskell? In this talk, I will discuss the influence of dependent types on the design of programming languages and on the practice of functional programmers. Over the past ten years, the Glasgow Haskell compiler has adopted several type system features inspired by dependent type theory. However, this process has not been a direct translation; working in the context of an existing language has led us to new designs in the semantics of dependent types. I will take a close look at what we have achieved in GHC and discuss what we have learned from this experiment: what works now, what doesn't work yet, and what has surprised us along the way.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Turon:2017:RPP, author = "Aaron Turon", title = "{Rust}: from {POPL} to practice (keynote)", journal = j-SIGPLAN, volume = "52", number = "1", pages = "2--2", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3011999", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In 2015, a language based fundamentally on substructural typing --- Rust --- hit its 1.0 release, and less than a year later it has been put into production use in a number of tech companies, including some household names. The language has started a trend, with several other mainstream languages, including C++ and Swift, in the early stages of incorporating ideas about ownership. How did this come about? Rust's core focus is safe systems programming. It does not require a runtime system or garbage collector, but guarantees memory safety. It does not stipulate any particular style of concurrent programming, but instead provides the tools needed to guarantee data race freedom even when doing low-level shared-state concurrency.
It allows you to build up high-level abstractions without paying a tax; its compilation model ensures that the abstractions boil away. These benefits derive from two core aspects of Rust: its ownership system (based on substructural typing) and its trait system (a descendant of Haskell's typeclasses). The talk will cover these two pillars of Rust design, with particular attention to the key innovations that make the language usable at scale. It will highlight the implications for concurrency, where Rust provides a unique perspective. It will also touch on aspects of Rust's development that tend to get less attention within the POPL community: Rust's governance and open development process, and design considerations around language and library evolution. Finally, it will mention a few of the myriad open research questions around Rust.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Alglave:2017:OPI, author = "Jade Alglave and Patrick Cousot", title = "Ogre and {Pythia}: an invariance proof method for weak consistency models", journal = j-SIGPLAN, volume = "52", number = "1", pages = "3--18", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009883", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We design an invariance proof method for concurrent programs parameterised by a weak consistency model. The calculational design of the invariance proof method is by abstract interpretation of a truly parallel analytic semantics. This generalises the methods by Lamport and Owicki-Gries for sequential consistency. We use cat as an example of language to write consistency specifications of both concurrent programs and machine architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Germane:2017:PEA, author = "Kimball Germane and Matthew Might", title = "A posteriori environment analysis with {Pushdown Delta CFA}", journal = j-SIGPLAN, volume = "52", number = "1", pages = "19--31", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009899", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Flow-driven higher-order inlining is blocked by free variables, yet current theories of environment analysis cannot reliably cope with multiply-bound variables. One of these, $ \Delta $CFA, is a promising theory based on stack change but is undermined by its finite-state model of the stack. 
We present Pushdown $ \Delta $CFA which takes a $ \Delta $CFA -approach to pushdown models of control flow and can cope with multiply-bound variables, even in the face of recursion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Li:2017:SDC, author = "Huisong Li and Fran{\c{c}}ois Berenger and Bor-Yuh Evan Chang and Xavier Rival", title = "Semantic-directed clumping of disjunctive abstract states", journal = j-SIGPLAN, volume = "52", number = "1", pages = "32--45", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009881", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To infer complex structural invariants, shape analyses rely on expressive families of logical properties. Many such analyses manipulate abstract memory states that consist of separating conjunctions of basic predicates describing atomic blocks or summaries. Moreover, they use finite disjunctions of abstract memory states in order to account for dissimilar shapes. Disjunctions should be kept small for the sake of scalability, though precision often requires to keep additional case splits. In this context, deciding when and how to merge case splits and to replace them with summaries is critical both for the precision and for the efficiency. Existing techniques use sets of syntactic rules, which are tedious to design and prone to failure. In this paper, we design a semantic criterion to clump abstract states based on their silhouette which applies not only to the conservative union of disjuncts, but also to the weakening of separating conjunction of memory predicates into inductive summaries. Our approach allows to define union and widening operators that aim at preserving the case splits that are required for the analysis to succeed. We implement this approach in the MemCAD analyzer, and evaluate it on real-world C codes from existing libraries, including programs dealing with doubly linked lists, red-black trees and AVL-trees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Singh:2017:FPA, author = "Gagandeep Singh and Markus P{\"u}schel and Martin Vechev", title = "Fast polyhedra abstract domain", journal = j-SIGPLAN, volume = "52", number = "1", pages = "46--59", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009885", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Numerical abstract domains are an important ingredient of modern static analyzers used for verifying critical program properties (e.g., absence of buffer overflow or memory safety). Among the many numerical domains introduced over the years, Polyhedra is the most expressive one, but also the most expensive: it has worst-case exponential space and time complexity. As a consequence, static analysis with the Polyhedra domain is thought to be impractical when applied to large scale, real world programs. In this paper, we present a new approach and a complete implementation for speeding up Polyhedra domain analysis. 
Our approach does not lose precision, and for many practical cases, is orders of magnitude faster than state-of-the-art solutions. The key insight underlying our work is that polyhedra arising during analysis can usually be kept decomposed, thus considerably reducing the overall complexity. We first present the theory underlying our approach, which identifies the interaction between partitions of variables and domain operators. Based on the theory we develop new algorithms for these operators that work with decomposed polyhedra. We implemented these algorithms using the same interface as existing libraries, thus enabling static analyzers to use our implementation with little effort. In our evaluation, we analyze large benchmarks from the popular software verification competition, including Linux device drivers with over 50K lines of code. Our experimental results demonstrate massive gains in both space and time: we show end-to-end speedups of two to five orders of magnitude compared to state-of-the-art Polyhedra implementations as well as significant memory gains, on all larger benchmarks. In fact, in many cases our analysis terminates in seconds where prior code runs out of memory or times out after 4 hours. We believe this work is an important step in making the Polyhedra abstract domain both feasible and practically usable for handling large, real-world programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Dolan:2017:PST, author = "Stephen Dolan and Alan Mycroft", title = "Polymorphism, subtyping, and type inference in {MLsub}", journal = j-SIGPLAN, volume = "52", number = "1", pages = "60--72", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009882", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a type system combining subtyping and ML-style parametric polymorphism. Unlike previous work, our system supports type inference and has compact principal types. We demonstrate this system in the minimal language MLsub, which types a strict superset of core ML programs. This is made possible by keeping a strict separation between the types used to describe inputs and those used to describe outputs, and extending the classical unification algorithm to handle subtyping constraints between these input and output types. Principal types are kept compact by type simplification, which exploits deep connections between subtyping and the algebra of regular languages. An implementation is available online.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Grigore:2017:JGT, author = "Radu Grigore", title = "{Java} generics are {Turing} complete", journal = j-SIGPLAN, volume = "52", number = "1", pages = "73--85", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009871", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes a reduction from the halting problem of Turing machines to subtype checking in Java. 
It follows that subtype checking in Java is undecidable, which answers a question posed by Kennedy and Pierce in 2007. It also follows that Java's type checker can recognize any recursive language, which improves a result of Gill and Levy from 2016. The latter point is illustrated by a parser generator for fluent interfaces.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Omar:2017:HBT, author = "Cyrus Omar and Ian Voysey and Michael Hilton and Jonathan Aldrich and Matthew A. Hammer", title = "{Hazelnut}: a bidirectionally typed structure editor calculus", journal = j-SIGPLAN, volume = "52", number = "1", pages = "86--99", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009900", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Structure editors allow programmers to edit the tree structure of a program directly. This can have cognitive benefits, particularly for novice and end-user programmers. It also simplifies matters for tool designers, because they do not need to contend with malformed program text. This paper introduces Hazelnut, a structure editor based on a small bidirectionally typed lambda calculus extended with holes and a cursor. Hazelnut goes one step beyond syntactic well-formedness: its edit actions operate over statically meaningful incomplete terms. Na{\"\i}vely, this would force the programmer to construct terms in a rigid ``outside-in'' manner. To avoid this problem, the action semantics automatically places terms assigned a type that is inconsistent with the expected type inside a hole. This meaningfully defers the type consistency check until the term inside the hole is finished. Hazelnut is not intended as an end-user tool itself. Instead, it serves as a foundational account of typed structure editing. To that end, we describe how Hazelnut's rich metatheory, which we have mechanized using the Agda proof assistant, serves as a guide when we extend the calculus to include binary sum types. We also discuss various interpretations of holes, and in so doing reveal connections with gradual typing and contextual modal type theory, the Curry--Howard interpretation of contextual modal logic. Finally, we discuss how Hazelnut's semantics lends itself to implementation as an event-based functional reactive program. Our simple reference implementation is written using js_of_ocaml.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Crary:2017:MAP, author = "Karl Crary", title = "Modules, abstraction, and parametric polymorphism", journal = j-SIGPLAN, volume = "52", number = "1", pages = "100--113", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009892", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reynolds's Abstraction theorem forms the mathematical foundation for data abstraction. His setting was the polymorphic lambda calculus.
Today, many modern languages, such as the ML family, employ rich module systems designed to give more expressive support for data abstraction than the polymorphic lambda calculus, but analogues of the Abstraction theorem for such module systems have lagged far behind. We give an account of the Abstraction theorem for a modern module calculus supporting generative and applicative functors, higher-order functors, sealing, and translucent signatures. The main issues to be overcome are: (1) the fact that modules combine both types and terms, so they must be treated as both simultaneously, (2) the effect discipline that models the distinction between transparent and opaque modules, and (3) a very rich language of type constructors supporting singleton kinds. We define logical equivalence for modules and show that it coincides with contextual equivalence. This substantiates the folk theorem that modules are good for data abstraction. All our proofs are formalized in Coq.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Lampropoulos:2017:BLL, author = "Leonidas Lampropoulos and Diane Gallois-Wong and Catalin Hritcu and John Hughes and Benjamin C. Pierce and Li-yao Xia", title = "Beginner's luck: a language for property-based generators", journal = j-SIGPLAN, volume = "52", number = "1", pages = "114--129", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009868", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Property-based random testing {\`a} la QuickCheck requires building efficient generators for well-distributed random data satisfying complex logical predicates, but writing these generators can be difficult and error prone. We propose a domain-specific language in which generators are conveniently expressed by decorating predicates with lightweight annotations to control both the distribution of generated values and the amount of constraint solving that happens before each variable is instantiated. This language, called Luck, makes generators easier to write, read, and maintain. We give Luck a formal semantics and prove several fundamental properties, including the soundness and completeness of random generation with respect to a standard predicate semantics. 
We evaluate Luck on common examples from the property-based testing literature and on two significant case studies, showing that it can be used in complex domains with comparable bug-finding effectiveness and a significant reduction in testing code size compared to handwritten generators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Shan:2017:EBI, author = "Chung-chieh Shan and Norman Ramsey", title = "Exact {Bayesian} inference by symbolic disintegration", journal = j-SIGPLAN, volume = "52", number = "1", pages = "130--144", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009852", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Bayesian inference, of posterior knowledge from prior knowledge and observed evidence, is typically defined by Bayes's rule, which says the posterior multiplied by the probability of an observation equals a joint probability. But the observation of a continuous quantity usually has probability zero, in which case Bayes's rule says only that the unknown times zero is zero. To infer a posterior distribution from a zero-probability observation, the statistical notion of disintegration tells us to specify the observation as an expression rather than a predicate, but does not tell us how to compute the posterior. We present the first method of computing a disintegration from a probabilistic program and an expression of a quantity to be observed, even when the observation has probability zero. Because the method produces an exact posterior term and preserves a semantics in which monadic terms denote measures, it composes with other inference methods in a modular way --- without sacrificing accuracy or performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Chatterjee:2017:SIP, author = "Krishnendu Chatterjee and Petr Novotn{\'y} and {\Eth}orde Zikeli{\'c}", title = "Stochastic invariants for probabilistic termination", journal = j-SIGPLAN, volume = "52", number = "1", pages = "145--160", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009873", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Termination is one of the basic liveness properties, and we study the termination problem for probabilistic programs with real-valued variables. Previous works focused on the qualitative problem that asks whether an input program terminates with probability 1 (almost-sure termination). A powerful approach for this qualitative problem is the notion of ranking supermartingales with respect to a given set of invariants. The quantitative problem (probabilistic termination) asks for bounds on the termination probability, and this problem has not been addressed yet.
A fundamental and conceptual drawback of the existing approaches to address probabilistic termination is that even though the supermartingales consider the probabilistic behaviour of the programs, the invariants are obtained completely ignoring the probabilistic aspect (i.e., the invariants are obtained considering all behaviours with no information about the probability). In this work we address the probabilistic termination problem for linear-arithmetic probabilistic programs with nondeterminism. We formally define the notion of stochastic invariants, which are constraints along with a probability bound that the constraints hold. We introduce a concept of repulsing supermartingales. First, we show that repulsing supermartingales can be used to obtain bounds on the probability of the stochastic invariants. Second, we show the effectiveness of repulsing supermartingales in the following three ways: (1) With a combination of ranking and repulsing supermartingales we can compute lower bounds on the probability of termination; (2) repulsing supermartingales provide witnesses for refutation of almost-sure termination; and (3) with a combination of ranking and repulsing supermartingales we can establish persistence properties of probabilistic programs. Along with our conceptual contributions, we establish the following computational results: First, the synthesis of a stochastic invariant which supports some ranking supermartingale and at the same time admits a repulsing supermartingale can be achieved via reduction to the existential first-order theory of reals, which generalizes existing results from the non-probabilistic setting. Second, given a program with ``strict invariants'' (e.g., obtained via abstract interpretation) and a stochastic invariant, we can check in polynomial time whether there exists a linear repulsing supermartingale w.r.t. the stochastic invariant (via reduction to LP). We also present experimental evaluation of our approach on academic examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Barthe:2017:CPP, author = "Gilles Barthe and Benjamin Gr{\'e}goire and Justin Hsu and Pierre-Yves Strub", title = "Coupling proofs are probabilistic product programs", journal = j-SIGPLAN, volume = "52", number = "1", pages = "161--174", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009896", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Couplings are a powerful mathematical tool for reasoning about pairs of probabilistic processes. Recent developments in formal verification identify a close connection between couplings and pRHL, a relational program logic motivated by applications to provable security, enabling formal construction of couplings from the probability theory literature. However, existing work using pRHL merely shows existence of a coupling and does not give a way to prove quantitative properties about the coupling, needed to reason about mixing and convergence of probabilistic processes. Furthermore, pRHL is inherently incomplete, and is not able to capture some advanced forms of couplings such as shift couplings. We address both problems as follows.
First, we define an extension of pRHL, called x-pRHL, which explicitly constructs the coupling in a pRHL derivation in the form of a probabilistic product program that simulates two correlated runs of the original program. Existing verification tools for probabilistic programs can then be directly applied to the probabilistic product to prove quantitative properties of the coupling. Second, we equip x-pRHL with a new rule for while loops, where reasoning can freely mix synchronized and unsynchronized loop iterations. Our proof rule can capture examples of shift couplings, and the logic is relatively complete for deterministic programs. We show soundness of x-PRHL and use it to analyze two classes of examples. First, we verify rapid mixing using different tools from coupling: standard coupling, shift coupling, and path coupling, a compositional principle for combining local couplings into a global coupling. Second, we verify (approximate) equivalence between a source and an optimized program for several instances of loop optimizations from the literature.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Kang:2017:PSR, author = "Jeehoon Kang and Chung-Kil Hur and Ori Lahav and Viktor Vafeiadis and Derek Dreyer", title = "A promising semantics for relaxed-memory concurrency", journal = j-SIGPLAN, volume = "52", number = "1", pages = "175--189", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009850", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite many years of research, it has proven very difficult to develop a memory model for concurrent programming languages that adequately balances the conflicting desiderata of programmers, compilers, and hardware. In this paper, we propose the first relaxed memory model that (1) accounts for a broad spectrum of features from the C++11 concurrency model, (2) is implementable, in the sense that it provably validates many standard compiler optimizations and reorderings, as well as standard compilation schemes to x86-TSO and Power, (3) justifies simple invariant-based reasoning, thus demonstrating the absence of bad ``out-of-thin-air'' behaviors, (4) supports ``DRF'' guarantees, ensuring that programmers who use sufficient synchronization need not understand the full complexities of relaxed-memory semantics, and (5) defines the semantics of racy programs without relying on undefined behaviors, which is a prerequisite for applicability to type-safe languages like Java. The key novel idea behind our model is the notion of *promises*: a thread may promise to execute a write in the future, thus enabling other threads to read from that write out of order. Crucially, to prevent out-of-thin-air behaviors, a promise step requires a thread-local certification that it will be possible to execute the promised write even in the absence of the promise. To establish confidence in our model, we have formalized most of our key results in Coq.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Wickerson:2017:ACM, author = "John Wickerson and Mark Batty and Tyler Sorensen and George A. 
Constantinides", title = "Automatically comparing memory consistency models", journal = j-SIGPLAN, volume = "52", number = "1", pages = "190--204", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009838", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A memory consistency model (MCM) is the part of a programming language or computer architecture specification that defines which values can legally be read from shared memory locations. Because MCMs take into account various optimisations employed by architectures and compilers, they are often complex and counterintuitive, which makes them challenging to design and to understand. We identify four tasks involved in designing and understanding MCMs: generating conformance tests, distinguishing two MCMs, checking compiler optimisations, and checking compiler mappings. We show that all four tasks are instances of a general constraint-satisfaction problem to which the solution is either a program or a pair of programs. Although this problem is intractable for automatic solvers when phrased over programs directly, we show how to solve analogous constraints over program executions, and then construct programs that satisfy the original constraints. Our technique, which is implemented in the Alloy modelling framework, is illustrated on several software- and architecture-level MCMs, both axiomatically and operationally defined. We automatically recreate several known results, often in a simpler form, including: distinctions between variants of the C11 MCM; a failure of the `SC-DRF guarantee' in an early C11 draft; that x86 is `multi-copy atomic' and Power is not; bugs in common C11 compiler optimisations; and bugs in a compiler mapping from OpenCL to AMD-style GPUs. We also use our technique to develop and validate a new MCM for NVIDIA GPUs that supports a natural mapping from OpenCL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Krebbers:2017:IPH, author = "Robbert Krebbers and Amin Timany and Lars Birkedal", title = "Interactive proofs in higher-order concurrent separation logic", journal = j-SIGPLAN, volume = "52", number = "1", pages = "205--217", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009855", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When using a proof assistant to reason in an embedded logic --- like separation logic --- one cannot benefit from the proof contexts and basic tactics of the proof assistant. This results in proofs that are at a too low level of abstraction because they are cluttered with bookkeeping code related to manipulating the object logic. In this paper, we introduce a so-called proof mode that extends the Coq proof assistant with (spatial and non-spatial) named proof contexts for the object logic. We show that thanks to these contexts we can implement high-level tactics for introduction and elimination of the connectives of the object logic, and thereby make reasoning in the embedded logic as seamless as reasoning in the meta logic of the proof assistant. 
We apply our method to Iris: a state of the art higher-order impredicative concurrent separation logic. We show that our method is very general, and is not just limited to program verification. We demonstrate its generality by formalizing correctness proofs of fine-grained concurrent algorithms, derived constructs of the Iris logic, and a unary and binary logical relation for a language with concurrency, higher-order store, polymorphism, and recursive types. This is the first formalization of a binary logical relation for such an expressive language. We also show how to use the logical relation to prove contextual refinement of fine-grained concurrent algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Krogh-Jespersen:2017:RMT, author = "Morten Krogh-Jespersen and Kasper Svendsen and Lars Birkedal", title = "A relational model of types-and-effects in higher-order concurrent separation logic", journal = j-SIGPLAN, volume = "52", number = "1", pages = "218--231", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009877", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recently we have seen a renewed interest in programming languages that tame the complexity of state and concurrency through refined type systems with more fine-grained control over effects. In addition to simplifying reasoning and eliminating whole classes of bugs, statically tracking effects opens the door to advanced compiler optimizations. In this paper we present a relational model of a type-and-effect system for a higher-order, concurrent programming language. The model precisely captures the semantic invariants expressed by the effect annotations. We demonstrate that these invariants are strong enough to prove advanced program transformations, including automatic parallelization of expressions with suitably disjoint effects. The model also supports refinement proofs between abstract data types implementations with different internal data representations, including proofs that fine-grained concurrent algorithms refine their coarse-grained counterparts. This is the first model for such an expressive language that supports both effect-based optimizations and data abstraction. The logical relation is defined in Iris, a state-of-the-art higher-order concurrent separation logic. 
This greatly simplifies proving well-definedness of the logical relation and also provides us with a powerful logic for reasoning in the model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{DAntoni:2017:MSO, author = "Loris D'Antoni and Margus Veanes", title = "Monadic second-order logic on finite sequences", journal = j-SIGPLAN, volume = "52", number = "1", pages = "232--245", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009844", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We extend the weak monadic second-order logic of one successor on finite strings (M2L-STR) to symbolic alphabets by allowing character predicates to range over decidable quantifier free theories instead of finite alphabets. We call this logic, which is able to describe sequences over complex and potentially infinite domains, symbolic M2L-STR (S-M2L-STR). We then present a decision procedure for S-M2L-STR based on a reduction to symbolic finite automata, a decidable extension of finite automata that allows transitions to carry predicates and can therefore model symbolic alphabets. The reduction constructs a symbolic automaton over an alphabet consisting of pairs of symbols where the first element of the pair is a symbol in the original formula's alphabet, while the second element is a bit-vector. To handle this modified alphabet we show that the Cartesian product of two decidable Boolean algebras (e.g., the formula's one and the bit-vector's one) also forms a decidable Boolean algebra. To make the decision procedure practical, we propose two efficient representations of the Cartesian product of two Boolean algebras, one based on algebraic decision diagrams and one on a variant of Shannon expansions. Finally, we implement our decision procedure and evaluate it on more than 10,000 formulas. Despite the generality, our implementation has comparable performance with the state-of-the-art M2L-STR solvers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Kobayashi:2017:RBH, author = "Naoki Kobayashi and {\'E}tienne Lozes and Florian Bruse", title = "On the relationship between higher-order recursion schemes and higher-order fixpoint logic", journal = j-SIGPLAN, volume = "52", number = "1", pages = "246--259", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009854", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We study the relationship between two kinds of higher-order extensions of model checking: HORS model checking, where models are extended to higher-order recursion schemes, and HFL model checking, where the logic is extended to higher-order modal fixpoint logic. Those extensions have been independently studied until recently, and the former has been applied to higher-order program verification. We show that there exist (arguably) natural reductions between the two problems.
To prove the correctness of the translation from HORS to HFL model checking, we establish a type-based characterization of HFL model checking, which should be of independent interest. The results reveal a close relationship between the two problems, enabling cross-fertilization of the two research threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Kovacs:2017:CTQ, author = "Laura Kov{\'a}cs and Simon Robillard and Andrei Voronkov", title = "Coming to terms with quantified reasoning", journal = j-SIGPLAN, volume = "52", number = "1", pages = "260--270", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009887", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The theory of finite term algebras provides a natural framework to describe the semantics of functional languages. The ability to efficiently reason about term algebras is essential to automate program analysis and verification for functional or imperative programs over inductively defined data types such as lists and trees. However, as the theory of finite term algebras is not finitely axiomatizable, reasoning about quantified properties over term algebras is challenging. In this paper we address full first-order reasoning about properties of programs manipulating term algebras, and describe two approaches for doing so by using first-order theorem proving. Our first method is a conservative extension of the theory of term algebras using a finite number of statements, while our second method relies on extending the superposition calculus of first-order theorem provers with additional inference rules. We implemented our work in the first-order theorem prover Vampire and evaluated it on a large number of inductive datatype benchmarks, as well as game theory constraints. Our experimental results show that our methods are able to find proofs for many hard problems previously unsolved by state-of-the-art methods. We also show that Vampire implementing our methods outperforms existing SMT solvers able to deal with inductive data types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Scully:2017:POA, author = "Ziv Scully and Adam Chlipala", title = "A program optimization for automatic database result caching", journal = j-SIGPLAN, volume = "52", number = "1", pages = "271--284", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009891", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most popular Web applications rely on persistent databases based on languages like SQL for declarative specification of data models and the operations that read and modify them. As applications scale up in user base, they often face challenges responding quickly enough to the high volume of requests. 
A common aid is caching of database results in the application's memory space, taking advantage of program-specific knowledge of which caching schemes are sound and useful, embodied in handwritten modifications that make the program less maintainable. These modifications also require nontrivial reasoning about the read-write dependencies across operations. In this paper, we present a compiler optimization that automatically adds sound SQL caching to Web applications coded in the Ur/Web domain-specific functional language, with no modifications required to source code. We use a custom cache implementation that supports concurrent operations without compromising the transactional semantics of the database abstraction. Through experiments with microbenchmarks and production Ur/Web applications, we show that our optimization in many cases enables an easy doubling or more of an application's throughput, requiring nothing more than passing an extra command-line flag to the compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Kiselyov:2017:SFC, author = "Oleg Kiselyov and Aggelos Biboudis and Nick Palladinos and Yannis Smaragdakis", title = "Stream fusion, to completeness", journal = j-SIGPLAN, volume = "52", number = "1", pages = "285--299", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009880", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Stream processing is mainstream (again): Widely-used stream libraries are now available for virtually all modern OO and functional languages, from Java to C\# to Scala to OCaml to Haskell. Yet expressivity and performance are still lacking. For instance, the popular, well-optimized Java 8 streams do not support the zip operator and are still an order of magnitude slower than hand-written loops. We present the first approach that represents the full generality of stream processing and eliminates overheads, via the use of staging. It is based on an unusually rich semantic model of stream interaction. We support any combination of zipping, nesting (or flat-mapping), sub-ranging, filtering, mapping --- of finite or infinite streams. Our model captures idiosyncrasies that a programmer uses in optimizing stream pipelines, such as rate differences and the choice of ``for'' vs. ``while'' loops. Our approach delivers hand-written-like code, but automatically. It explicitly avoids the reliance on black-box optimizers and sufficiently-smart compilers, offering highest, guaranteed and portable performance. Our approach relies on high-level concepts that are then readily mapped into an implementation. Accordingly, we have two distinct implementations: an OCaml stream library, staged via MetaOCaml, and a Scala library for the JVM, staged via LMS. In both cases, we derive libraries richer and simultaneously many tens of times faster than past work.
We greatly exceed in performance the standard stream libraries available in Java, Scala and OCaml, including the well-optimized Java 8 streams.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Chiang:2017:RFP, author = "Wei-Fan Chiang and Mark Baranowski and Ian Briggs and Alexey Solovyev and Ganesh Gopalakrishnan and Zvonimir Rakamari{\'c}", title = "Rigorous floating-point mixed-precision tuning", journal = j-SIGPLAN, volume = "52", number = "1", pages = "300--315", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009846", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Virtually all real-valued computations are carried out using floating-point data types and operations. The precision of these data types must be set with the goals of reducing the overall round-off error, but also emphasizing performance improvements. Often, a mixed-precision allocation achieves this optimum; unfortunately, there are no techniques available to compute such allocations and conservatively meet a given error target across all program inputs. In this work, we present a rigorous approach to precision allocation based on formal analysis via Symbolic Taylor Expansions, and error analysis based on interval functions. This approach is implemented in an automated tool called FPTuner that generates and solves a quadratically constrained quadratic program to obtain a precision-annotated version of the given expression. FPTuner automatically introduces all the requisite precision up and down casting operations. It also allows users to flexibly control precision allocation using constraints to cap the number of high precision operators as well as group operators to allocate the same precision to facilitate vectorization. We evaluate FPTuner by tuning several benchmarks and measuring the proportion of lower precision operators allocated as we increase the error threshold. We also measure the reduction in energy consumption resulting from executing mixed-precision tuned code on a real hardware platform. We observe significant energy savings in response to mixed-precision tuning, but also observe situations where unexpected compiler behaviors thwart intended optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Cicek:2017:RCA, author = "Ezgi {\c{C}}i{\c{c}}ek and Gilles Barthe and Marco Gaboardi and Deepak Garg and Jan Hoffmann", title = "Relational cost analysis", journal = j-SIGPLAN, volume = "52", number = "1", pages = "316--329", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009858", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Establishing quantitative bounds on the execution cost of programs is essential in many areas of computer science such as complexity analysis, compiler optimizations, security and privacy. 
Techniques based on program analysis, type systems and abstract interpretation are well-studied, but methods for analyzing how the execution costs of two programs compare to each other have not received attention. Naively combining the worst and best case execution costs of the two programs does not work well in many cases because such analysis forgets the similarities between the programs or the inputs. In this work, we propose a relational cost analysis technique that is capable of establishing precise bounds on the difference in the execution cost of two programs by making use of relational properties of programs and inputs. We develop RelCost, a refinement type and effect system for a higher-order functional language with recursion and subtyping. The key novelty of our technique is the combination of relational refinements with two modes of typing --- relational typing for reasoning about similar computations/inputs and unary typing for reasoning about unrelated computations/inputs. This combination allows us to analyze the execution cost difference of two programs more precisely than a naive non-relational approach. We prove our type system sound using a semantic model based on step-indexed unary and binary logical relations accounting for non-relational and relational reasoning principles with their respective costs. We demonstrate the precision and generality of our technique through examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Madhavan:2017:CBR, author = "Ravichandhran Madhavan and Sumith Kulal and Viktor Kuncak", title = "Contract-based resource verification for higher-order functions with memoization", journal = j-SIGPLAN, volume = "52", number = "1", pages = "330--343", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009874", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a new approach for specifying and verifying resource utilization of higher-order functional programs that use lazy evaluation and memoization. In our approach, users can specify the desired resource bound as templates with numerical holes e.g. as steps $ \leq $ ? * size(l) + ? in the contracts of functions. They can also express invariants necessary for establishing the bounds that may depend on the state of memoization. Our approach operates in two phases: first generating an instrumented first-order program that accurately models the higher-order control flow and the effects of memoization on resources using sets, algebraic datatypes and mutual recursion, and then verifying the contracts of the first-order program by producing verification conditions of the form $ \exists \forall $ using an extended assume/guarantee reasoning. We use our approach to verify precise bounds on resources such as evaluation steps and number of heap-allocated objects on 17 challenging data structures and algorithms.
Our benchmarks, comprising of 5K lines of functional Scala code, include lazy mergesort, Okasaki's real-time queue and deque data structures that rely on aliasing of references to first-class functions; lazy data structures based on numerical representations such as the conqueue data structure of Scala's data-parallel library, cyclic streams, as well as dynamic programming algorithms such as knapsack and Viterbi. Our evaluations show that when averaged over all benchmarks the actual runtime resource consumption is 80\% of the value inferred by our tool when estimating the number of evaluation steps, and is 88\% for the number of heap-allocated objects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Zhang:2017:CSD, author = "Qirun Zhang and Zhendong Su", title = "Context-sensitive data-dependence analysis via linear conjunctive language reachability", journal = j-SIGPLAN, volume = "52", number = "1", pages = "344--358", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009848", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many program analysis problems can be formulated as graph reachability problems. In the literature, context-free language (CFL) reachability has been the most popular formulation and can be computed in subcubic time. The context-sensitive data-dependence analysis is a fundamental abstraction that can express a broad range of program analysis problems. It essentially describes an interleaved matched-parenthesis language reachability problem. The language is not context-free, and the problem is well-known to be undecidable. In practice, many program analyses adopt CFL-reachability to exactly model the matched parentheses for either context-sensitivity or structure-transmitted data-dependence, but not both. Thus, the CFL-reachability formulation for context-sensitive data-dependence analysis is inherently an approximation. To support more precise and scalable analyses, this paper introduces linear conjunctive language (LCL) reachability, a new, expressive class of graph reachability. LCL not only contains the interleaved matched-parenthesis language, but is also closed under all set-theoretic operations. Given a graph with n nodes and m edges, we propose an O ( mn ) time approximation algorithm for solving all-pairs LCL-reachability, which is asymptotically better than known CFL-reachability algorithms. Our formulation and algorithm offer a new perspective on attacking the aforementioned undecidable problem --- the LCL-reachability formulation is exact, while the LCL-reachability algorithm yields a sound approximation. We have applied the LCL-reachability framework to two existing client analyses. The experimental results show that the LCL-reachability framework is both more precise and scalable than the traditional CFL-reachability framework. 
This paper opens up the opportunity to exploit LCL-reachability in program analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Hoffmann:2017:TAR, author = "Jan Hoffmann and Ankush Das and Shu-Chun Weng", title = "Towards automatic resource bound analysis for {OCaml}", journal = j-SIGPLAN, volume = "52", number = "1", pages = "359--373", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009842", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This article presents a resource analysis system for OCaml programs. The system automatically derives worst-case resource bounds for higher-order polymorphic programs with user-defined inductive types. The technique is parametric in the resource and can derive bounds for time, memory allocations and energy usage. The derived bounds are multivariate resource polynomials which are functions of different size parameters that depend on the standard OCaml types. Bound inference is fully automatic and reduced to a linear optimization problem that is passed to an off-the-shelf LP solver. Technically, the analysis system is based on a novel multivariate automatic amortized resource analysis (AARA). It builds on existing work on linear AARA for higher-order programs with user-defined inductive types and on multivariate AARA for first-order programs with built-in lists and binary trees. This is the first amortized analysis that automatically derives polynomial bounds for higher-order functions and polynomial bounds that depend on user-defined inductive types. Moreover, the analysis handles a limited form of side effects and even outperforms the linear bound inference of previous systems. At the same time, it preserves the expressivity and efficiency of existing AARA techniques. The practicality of the analysis system is demonstrated with an implementation and integration with Inria's OCaml compiler. The implementation is used to automatically derive resource bounds for 411 functions and 6018 lines of code derived from OCaml libraries, the CompCert compiler, and implementations of textbook algorithms. In a case study, the system infers bounds on the number of queries that are sent by OCaml programs to DynamoDB, a commercial NoSQL cloud database service.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Scherer:2017:DES, author = "Gabriel Scherer", title = "Deciding equivalence with sums and the empty type", journal = j-SIGPLAN, volume = "52", number = "1", pages = "374--386", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009901", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The logical technique of focusing can be applied to the $ \lambda $ -calculus; in a simple type system with atomic types and negative type formers (functions, products, the unit type), its normal forms coincide with $ \beta \eta $-normal forms.
Introducing a saturation phase gives a notion of quasi-normal forms in the presence of positive types (sum types and the empty type). This rich structure lets us prove the decidability of $ \beta \eta $-equivalence in the presence of the empty type, the fact that it coincides with contextual equivalence, and with set-theoretic equality in all finite models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Ilik:2017:ELN, author = "Danko Ilik", title = "The exp--log normal form of types: decomposing extensional equality and representing terms compactly", journal = j-SIGPLAN, volume = "52", number = "1", pages = "387--399", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009841", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Lambda calculi with algebraic data types lie at the core of functional programming languages and proof assistants, but conceal at least two fundamental theoretical problems already in the presence of the simplest non-trivial data type, the sum type. First, we do not know of an explicit and implemented algorithm for deciding the beta-eta-equality of terms---and this in spite of the first decidability results proven two decades ago. Second, it is not clear how to decide when two types are essentially the same, i.e. isomorphic, in spite of the meta-theoretic results on decidability of the isomorphism. In this paper, we present the exp-log normal form of types---derived from the representation of exponential polynomials via the unary exponential and logarithmic functions---that any type built from arrows, products, and sums can be isomorphically mapped to. The type normal form can be used as a simple heuristic for deciding type isomorphism, thanks to the fact that it is a systematic application of the high-school identities. We then show that the type normal form allows us to reduce the standard beta-eta equational theory of the lambda calculus to a specialized version of itself, while preserving completeness of the equality on terms. We end by describing an alternative representation of normal terms of the lambda calculus with sums, together with a Coq-implemented converter into/from our new term calculus. The difference with the only other previously implemented heuristic for deciding interesting instances of eta-equality, by Balat, Di Cosmo, and Fiore, is that we exploit the type information of terms substantially, and this often allows us to obtain a canonical representation of terms without performing sophisticated term analyses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Levy:2017:CI, author = "Paul Blain Levy", title = "Contextual isomorphisms", journal = j-SIGPLAN, volume = "52", number = "1", pages = "400--414", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009898", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "What is the right notion of ``isomorphism'' between types, in a simple type theory?
The traditional answer is: a pair of terms that are inverse up to a specified congruence. We firstly argue that, in the presence of effects, this answer is too liberal and needs to be restricted, using F{\"u}hrmann's notion of thunkability in the case of value types (as in call-by-value), or using Munch-Maccagnoni's notion of linearity in the case of computation types (as in call-by-name). Yet that leaves us with different notions of isomorphism for different kinds of type. This situation is resolved by means of a new notion of ``contextual'' isomorphism (or morphism), analogous at the level of types to contextual equivalence of terms. A contextual morphism is a way of replacing one type with the other wherever it may occur in a judgement, in a way that is preserved by the action of any term with holes. For types of pure $ \lambda $-calculus, we show that a contextual morphism corresponds to a traditional isomorphism. For value types, a contextual morphism corresponds to a thunkable isomorphism, and for computation types, to a linear isomorphism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Brown:2017:TSE, author = "Matt Brown and Jens Palsberg", title = "Typed self-evaluation via intensional type functions", journal = j-SIGPLAN, volume = "52", number = "1", pages = "415--428", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009853", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many popular languages have a self-interpreter, that is, an interpreter for the language written in itself. So far, work on polymorphically-typed self-interpreters has concentrated on self-recognizers that merely recover a program from its representation. A larger and until now unsolved challenge is to implement a polymorphically-typed self-evaluator that evaluates the represented program and produces a representation of the result. We present F$_\omega^{\mu i}$, the first $ \lambda $-calculus that supports a polymorphically-typed self-evaluator. Our calculus extends F$_\omega $ with recursive types and intensional type functions and has decidable type checking. Our key innovation is a novel implementation of type equality proofs that enables us to define a versatile representation of programs. Our results establish a new category of languages that can support polymorphically-typed self-evaluators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Flur:2017:MSC, author = "Shaked Flur and Susmit Sarkar and Christopher Pulte and Kyndylan Nienhuis and Luc Maranget and Kathryn E.
Gray and Ali Sezgin and Mark Batty and Peter Sewell", title = "Mixed-size concurrency: {ARM}, {POWER}, {C\slash C++11}, and {SC}", journal = j-SIGPLAN, volume = "52", number = "1", pages = "429--442", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009839", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Previous work on the semantics of relaxed shared-memory concurrency has only considered the case in which each load reads the data of exactly one store. In practice, however, multiprocessors support mixed-size accesses, and these are used by systems software and (to some degree) exposed at the C/C++ language level. A semantic foundation for software, therefore, has to address them. We investigate the mixed-size behaviour of ARMv8 and IBM POWER architectures and implementations: by experiment, by developing semantic models, by testing the correspondence between these, and by discussion with ARM and IBM staff. This turns out to be surprisingly subtle, and on the way we have to revisit the fundamental concepts of coherence and sequential consistency, which change in this setting. In particular, we show that adding a memory barrier between each instruction does not restore sequential consistency. We go on to extend the C/C++11 model to support non-atomic mixed-size memory accesses. This is a necessary step towards semantics for real-world shared-memory concurrent code, beyond litmus tests.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Lidbury:2017:DRD, author = "Christopher Lidbury and Alastair F. Donaldson", title = "Dynamic race detection for {C++11}", journal = j-SIGPLAN, volume = "52", number = "1", pages = "443--457", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009857", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The intricate rules for memory ordering and synchronisation associated with the C/C++11 memory model mean that data races can be difficult to eliminate from concurrent programs. Dynamic data race analysis can pinpoint races in large and complex applications, but the state-of-the-art ThreadSanitizer (tsan) tool for C/C++ considers only sequentially consistent program executions, and does not correctly model synchronisation between C/C++11 atomic operations. We present a scalable dynamic data race analysis for C/C++11 that correctly captures C/C++11 synchronisation, and uses instrumentation to support exploration of a class of non sequentially consistent executions. We concisely define the memory model fragment captured by our instrumentation via a restricted axiomatic semantics, and show that the axiomatic semantics permits exactly those executions explored by our instrumentation. We have implemented our analysis in tsan, and evaluate its effectiveness on benchmark programs, enabling a comparison with the CDSChecker tool, and on two large and highly concurrent applications: the Firefox and Chromium web browsers. 
Our results show that our method can detect races that are beyond the scope of the original tsan tool, and that the overhead associated with applying our enhanced instrumentation to large applications is tolerable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Brutschy:2017:SEC, author = "Lucas Brutschy and Dimitar Dimitrov and Peter M{\"u}ller and Martin Vechev", title = "Serializability for eventual consistency: criterion, analysis, and applications", journal = j-SIGPLAN, volume = "52", number = "1", pages = "458--472", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009895", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Developing and reasoning about systems using eventually consistent data stores is a difficult challenge due to the presence of unexpected behaviors that do not occur under sequential consistency. A fundamental problem in this setting is to identify a correctness criterion that precisely captures intended application behaviors yet is generic enough to be applicable to a wide range of applications. In this paper, we present such a criterion. More precisely, we generalize conflict serializability to the setting of eventual consistency. Our generalization is based on a novel dependency model that incorporates two powerful algebraic properties: commutativity and absorption. These properties enable precise reasoning about programs that employ high-level replicated data types, common in modern systems. To apply our criterion in practice, we also developed a dynamic analysis algorithm and a tool that checks whether a given program execution is serializable. We performed a thorough experimental evaluation on two real-world use cases: debugging cloud-backed mobile applications and implementing clients of a popular eventually consistent key-value store. Our experimental results indicate that our criterion reveals harmful synchronization problems in applications, is more effective at finding them than prior approaches, and can be used for the development of practical, eventually consistent applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Hoenicke:2017:TMM, author = "Jochen Hoenicke and Rupak Majumdar and Andreas Podelski", title = "Thread modularity at many levels: a pearl in compositional verification", journal = j-SIGPLAN, volume = "52", number = "1", pages = "473--485", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009893", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A thread-modular proof for the correctness of a concurrent program is based on an inductive and interference-free annotation of each thread. It is well-known that the corresponding proof system is not complete (unless one adds auxiliary variables). We describe a hierarchy of proof systems where each level k corresponds to a generalized notion of thread modularity (level 1 corresponds to the original notion). 
Each level is strictly more expressive than the previous. Further, each level precisely captures programs that can be proved using uniform Ashcroft invariants with k universal quantifiers. We demonstrate the usefulness of the hierarchy by giving a compositional proof of the Mach shootdown algorithm for TLB consistency. We show a proof at level 2 that shows the algorithm is correct for an arbitrary number of CPUs. However, there is no proof for the algorithm at level 1 which does not involve auxiliary state.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Leijen:2017:TDC, author = "Daan Leijen", title = "Type directed compilation of row-typed algebraic effects", journal = j-SIGPLAN, volume = "52", number = "1", pages = "486--499", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009872", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Algebraic effect handlers, introduced by Plotkin and Power in 2002, are recently gaining in popularity as a purely functional approach to modeling effects. In this article, we give a full overview of practical algebraic effects in the context of a compiled implementation in the Koka language. In particular, we show how algebraic effects generalize over common constructs like exception handling, state, iterators and async-await. We give an effective type inference algorithm based on extensible effect rows using scoped labels, and a direct operational semantics. Finally, we show an efficient compilation scheme to common runtime platforms (like JavaScript) using a type directed selective CPS translation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Lindley:2017:DDD, author = "Sam Lindley and Conor McBride and Craig McLaughlin", title = "Do be do be do", journal = j-SIGPLAN, volume = "52", number = "1", pages = "500--514", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009897", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We explore the design and implementation of Frank, a strict functional programming language with a bidirectional effect type system designed from the ground up around a novel variant of Plotkin and Pretnar's effect handler abstraction. Effect handlers provide an abstraction for modular effectful programming: a handler acts as an interpreter for a collection of commands whose interfaces are statically tracked by the type system. However, Frank eliminates the need for an additional effect handling construct by generalising the basic mechanism of functional abstraction itself. A function is simply the special case of a Frank operator that interprets no commands. Moreover, Frank's operators can be multihandlers which simultaneously interpret commands from several sources at once, without disturbing the direct style of functional programming with values. Effect typing in Frank employs a novel form of effect polymorphism which avoids mentioning effect variables in source code.
This is achieved by propagating an ambient ability inwards, rather than accumulating unions of potential effects outwards. We introduce Frank by example, and then give a formal account of the Frank type system and its semantics. We introduce Core Frank by elaborating Frank operators into functions, case expressions, and unary handlers, and then give a sound small-step operational semantics for Core Frank. Programming with effects and handlers is in its infancy. We contribute an exploration of future possibilities, particularly in combination with other forms of rich type system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Ahman:2017:DMF, author = "Danel Ahman and Catalin Hritcu and Kenji Maillard and Guido Mart{\'\i}nez and Gordon Plotkin and Jonathan Protzenko and Aseem Rastogi and Nikhil Swamy", title = "{Dijkstra} monads for free", journal = j-SIGPLAN, volume = "52", number = "1", pages = "515--529", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009878", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dijkstra monads enable a dependent type theory to be enhanced with support for specifying and verifying effectful code via weakest preconditions. Together with their closely related counterparts, Hoare monads, they provide the basis on which verification tools like F*, Hoare Type Theory (HTT), and Ynot are built. We show that Dijkstra monads can be derived ``for free'' by applying a continuation-passing style (CPS) translation to the standard monadic definitions of the underlying computational effects. Automatically deriving Dijkstra monads in this way provides a correct-by-construction and efficient way of reasoning about user-defined effects in dependent type theories. We demonstrate these ideas in EMF*, a new dependently typed calculus, validating it via both formal proof and a prototype implementation within F*. Besides equipping F* with a more uniform and extensible effect system, EMF* enables a novel mixture of intrinsic and extrinsic proofs within F*.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Sekiyama:2017:SMC, author = "Taro Sekiyama and Atsushi Igarashi", title = "Stateful manifest contracts", journal = j-SIGPLAN, volume = "52", number = "1", pages = "530--544", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009875", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper studies hybrid contract verification for an imperative higher-order language based on a so-called manifest contract system. In manifest contract systems, contracts are part of static types and contract verification is hybrid in the sense that some contracts are statically verified, typically by subtyping, but others are verified dynamically by casts.
It is, however, not trivial to extend existing manifest contract systems, which have been designed mostly for pure functional languages, to imperative features, mainly because of the lack of flow-sensitivity, which should be taken into account in verifying imperative programs statically. We develop an imperative higher-order manifest contract system $ \lambda_{\rm ref}^H $ for flow-sensitive hybrid contract verification. We introduce a computational variant of Nanevski et al.'s Hoare types, which are flow-sensitive types to represent pre- and postconditions of impure computation. Our Hoare types are computational in the sense that pre- and postconditions are given by Booleans in the same language as programs so that they are dynamically verifiable. $ \lambda_{\rm ref}^H $ also supports refinement types as in existing manifest contract systems to describe flow-insensitive, state-independent contracts of pure computation. While it is desirable that any --- possibly state-manipulating --- predicate can be used in contracts, abuse of stateful operations will break the system. To control stateful operations in contracts, we introduce a region-based effect system, which allows contracts in refinement types and computational Hoare types to manipulate states, as long as they are observationally pure and read-only, respectively. We show that dynamic contract checking in our calculus is consistent with static typing in the sense that the final result obtained without dynamic contract violations satisfies contracts in its static type. It in particular means that the state after stateful computations satisfies their postconditions. As in some prior manifest contract systems, static contract verification in this work is ``post facto,'' that is, we first define our manifest contract system so that all contracts are checked at run time, formalize conditions when dynamic checks can be removed safely, and show that programs with and without such removable checks are contextually equivalent. We also apply the idea of post facto verification to region-based local reasoning, inspired by the frame rule of Separation Logic.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{deAmorim:2017:SAM, author = "Arthur Azevedo de Amorim and Marco Gaboardi and Justin Hsu and Shin-ya Katsumata and Ikram Cherigui", title = "A semantic account of metric preservation", journal = j-SIGPLAN, volume = "52", number = "1", pages = "545--556", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009890", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Program sensitivity measures how robust a program is to small changes in its input, and is a fundamental notion in domains ranging from differential privacy to cyber-physical systems. A natural way to formalize program sensitivity is in terms of metrics on the input and output spaces, requiring that an r-sensitive function map inputs that are at distance d to outputs that are at distance at most $ r \cdot d $. Program sensitivity is thus an analogue of Lipschitz continuity for programs. Reed and Pierce introduced Fuzz, a functional language with a linear type system that can express program sensitivity.
They show soundness operationally, in the form of a metric preservation property. Inspired by their work, we study program sensitivity and metric preservation from a denotational point of view. In particular, we introduce metric CPOs, a novel semantic structure for reasoning about computation on metric spaces, by endowing CPOs with a compatible notion of distance. This structure is useful for reasoning about metric properties of programs, and specifically about program sensitivity. We demonstrate metric CPOs by giving a model for the deterministic fragment of Fuzz.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Smolka:2017:CMS, author = "Steffen Smolka and Praveen Kumar and Nate Foster and Dexter Kozen and Alexandra Silva", title = "{Cantor} meets {Scott}: semantic foundations for probabilistic networks", journal = j-SIGPLAN, volume = "52", number = "1", pages = "557--571", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009843", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "ProbNetKAT is a probabilistic extension of NetKAT with a denotational semantics based on Markov kernels. The language is expressive enough to generate continuous distributions, which raises the question of how to compute effectively in the language. This paper gives a new characterization of ProbNetKAT's semantics using domain theory, which provides the foundation needed to build a practical implementation. We show how to use the semantics to approximate the behavior of arbitrary ProbNetKAT programs using distributions with finite support. We develop a prototype implementation and show how to use it to solve a variety of problems including characterizing the expected congestion induced by different routing schemes and reasoning probabilistically about reachability in a network.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Subramanian:2017:GSF, author = "Kausik Subramanian and Loris D'Antoni and Aditya Akella", title = "{Genesis}: synthesizing forwarding tables in multi-tenant networks", journal = j-SIGPLAN, volume = "52", number = "1", pages = "572--585", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009845", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Operators in multi-tenant cloud datacenters require support for diverse and complex end-to-end policies, such as reachability, middlebox traversals, isolation, traffic engineering, and network resource management. We present Genesis, a datacenter network management system which allows policies to be specified in a declarative manner without explicitly programming the network data plane. Genesis tackles the problem of enforcing policies by synthesizing switch forwarding tables. It uses the formal foundations of constraint solving in combination with fast off-the-shelf SMT solvers.
To improve synthesis performance, Genesis incorporates a novel search strategy that uses regular expressions to specify properties that leverage the structure of datacenter networks, and a divide-and-conquer synthesis procedure which exploits the structure of policy relationships. We have prototyped Genesis, and conducted experiments with a variety of workloads on real-world topologies to demonstrate its performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Kopczynski:2017:LSS, author = "Eryk Kopczy{\'n}ski and Szymon Toru{\'n}czyk", title = "{LOIS}: syntax and semantics", journal = j-SIGPLAN, volume = "52", number = "1", pages = "586--598", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009876", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the semantics of an imperative programming language called LOIS (Looping Over Infinite Sets), which allows iterating through certain infinite sets in finite time. Our semantics intuitively corresponds to the execution of infinitely many threads in parallel. This allows us to merge the power of abstract mathematical constructions into imperative programming. Infinite sets are internally represented using first order formulas over some underlying logical structure, and SMT solvers are employed to evaluate programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Feng:2017:CBSa, author = "Yu Feng and Ruben Martins and Yuepeng Wang and Isil Dillig and Thomas W. Reps", title = "Component-based synthesis for complex {APIs}", journal = j-SIGPLAN, volume = "52", number = "1", pages = "599--612", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009851", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Component-based approaches to program synthesis assemble programs from a database of existing components, such as methods provided by an API. In this paper, we present a novel type-directed algorithm for component-based synthesis. The key novelty of our approach is the use of a compact Petri-net representation to model relationships between methods in an API. Given a target method signature S, our approach performs reachability analysis on the underlying Petri-net model to identify sequences of method calls that could be used to synthesize an implementation of S. The programs synthesized by our algorithm are guaranteed to type check and pass all test cases provided by the user. We have implemented this approach in a tool called SyPet, and used it to successfully synthesize real-world programming tasks extracted from on-line forums and existing code repositories. We also compare SyPet with two state-of-the-art synthesis tools, namely InSynth and CodeHint, and demonstrate that SyPet can synthesize more programs in less time.
Finally, we compare our approach with an alternative solution based on hypergraphs and demonstrate its advantages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Moerman:2017:LNA, author = "Joshua Moerman and Matteo Sammartino and Alexandra Silva and Bartek Klin and Michal Szynwelski", title = "Learning nominal automata", journal = j-SIGPLAN, volume = "52", number = "1", pages = "613--625", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009879", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an Angluin-style algorithm to learn nominal automata, which are acceptors of languages over infinite (structured) alphabets. The abstract approach we take allows us to seamlessly extend known variations of the algorithm to this new setting. In particular we can learn a subclass of nominal non-deterministic automata. An implementation using a recently developed Haskell library for nominal computation is provided for preliminary experiments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Bouajjani:2017:VCC, author = "Ahmed Bouajjani and Constantin Enea and Rachid Guerraoui and Jad Hamza", title = "On verifying causal consistency", journal = j-SIGPLAN, volume = "52", number = "1", pages = "626--638", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009888", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Causal consistency is one of the most adopted consistency criteria for distributed implementations of data structures. It ensures that operations are executed at all sites according to their causal precedence. We address the issue of verifying automatically whether the executions of an implementation of a data structure are causally consistent. We consider two problems: (1) checking whether one single execution is causally consistent, which is relevant for developing testing and bug finding algorithms, and (2) verifying whether all the executions of an implementation are causally consistent. We show that the first problem is NP-complete. This holds even for the read-write memory abstraction, which is a building block of many modern distributed systems. Indeed, such systems often store data in key-value stores, which are instances of the read-write memory abstraction. Moreover, we prove that, surprisingly, the second problem is undecidable, and again this holds even for the read-write memory abstraction. However, we show that for the read-write memory abstraction, these negative results can be circumvented if the implementations are data independent, i.e., their behaviors do not depend on the data values that are written or read at each moment, which is a realistic assumption. We prove that for data independent implementations, the problem of checking the correctness of a single execution w.r.t. the read-write memory abstraction is polynomial time. 
Furthermore, we show that for such implementations the set of non-causally consistent executions can be represented by means of a finite number of register automata. Using these machines as observers (in parallel with the implementation) allows us to polynomially reduce the problem of checking causal consistency to a state reachability problem. This reduction holds regardless of the class of programs used for the implementation, of the number of read-write variables, and of the data domain used. It allows leveraging existing techniques for assertion/reachability checking to causal consistency verification. Moreover, for a significant class of implementations, we derive from this reduction the decidability of verifying causal consistency w.r.t. the read-write memory abstraction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Srikanth:2017:CVU, author = "Akhilesh Srikanth and Burak Sahin and William R. Harris", title = "Complexity verification using guided theorem enumeration", journal = j-SIGPLAN, volume = "52", number = "1", pages = "639--652", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009864", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Determining if a given program satisfies a given bound on the amount of resources that it may use is a fundamental problem with critical practical applications. Conventional automatic verifiers for safety properties cannot be applied to address this problem directly because such verifiers target properties expressed in decidable theories; however, many practical bounds are expressed in nonlinear theories, which are undecidable. In this work, we introduce an automatic verification algorithm, CAMPY, that determines if a given program P satisfies a given resource bound B, which may be expressed using polynomial, exponential, and logarithmic terms. The key technical contribution behind our verifier is an interpolating theorem prover for non-linear theories that lazily learns a sufficiently accurate approximation of non-linear theories by selectively grounding theorems of the nonlinear theory that are relevant to proving that P satisfies B. To evaluate CAMPY, we implemented it to target Java Virtual Machine bytecode. We applied CAMPY to verify that over 20 solutions submitted for programming problems hosted on popular online coding platforms satisfy or do not satisfy expected complexity bounds.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Dudenhefner:2017:ITC, author = "Andrej Dudenhefner and Jakob Rehof", title = "Intersection type calculi of bounded dimension", journal = j-SIGPLAN, volume = "52", number = "1", pages = "653--665", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009862", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A notion of dimension in intersection typed \lambda -calculi is presented.
The dimension of a typed \lambda -term is given by the minimal norm of an elaboration (a proof theoretic decoration) necessary for typing the term at its type, and, intuitively, measures intersection introduction as a resource. Bounded-dimensional intersection type calculi are shown to enjoy subject reduction, since terms can be elaborated in non-increasing norm under \beta -reduction. We prove that a multiset interpretation (corresponding to a non-idempotent and non-linear interpretation of intersection) of dimensionality corresponds to the number of simultaneous constraints required during search for inhabitants. As a consequence, the inhabitation problem is decidable in bounded multiset dimension, and it is proven to be EXPSPACE-complete. This result is a substantial generalization of inhabitation for the rank 2-fragment, yielding a calculus with decidable inhabitation which is independent of rank. Our results give rise to a new criterion (dimensional bound) for subclasses of intersection type calculi with a decidable inhabitation problem, which is orthogonal to previously known criteria, and which should have immediate applications in synthesis. Additionally, we give examples of dimensional analysis of fragments of the intersection type system, including conservativity over simple types, rank 2-types, and normal form typings, and we provide some observations towards dimensional analysis of other systems. It is suggested (for future work) that our notion of dimension may have semantic interpretations in terms of reduction complexity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Amin:2017:TSP, author = "Nada Amin and Tiark Rompf", title = "Type soundness proofs with definitional interpreters", journal = j-SIGPLAN, volume = "52", number = "1", pages = "666--679", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009866", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "While type soundness proofs are taught in every graduate PL class, the gap between realistic languages and what is accessible to formal proofs is large. In the case of Scala, it has been shown that its formal model, the Dependent Object Types (DOT) calculus, cannot simultaneously support key metatheoretic properties such as environment narrowing and subtyping transitivity, which are usually required for a type soundness proof. Moreover, Scala and many other realistic languages lack a general substitution property. The first contribution of this paper is to demonstrate how type soundness proofs for advanced, polymorphic, type systems can be carried out with an operational semantics based on high-level, definitional interpreters, implemented in Coq. We present the first mechanized soundness proofs in this style for System F and several extensions, including mutable references. Our proofs use only straightforward induction, which is significant, as the combination of big-step semantics, mutable references, and polymorphism is commonly believed to require coinductive proof techniques. 
The second main contribution of this paper is to show how DOT-like calculi emerge from straightforward generalizations of the operational aspects of F, exposing a rich design space of calculi with path-dependent types in between System F and DOT, which we dub the System D Square. By working directly on the target language, definitional interpreters can focus the design space and expose the invariants that actually matter at runtime. Looking at such runtime invariants is an exciting new avenue for type system design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Angiuli:2017:CHD, author = "Carlo Angiuli and Robert Harper and Todd Wilson", title = "Computational higher-dimensional type theory", journal = j-SIGPLAN, volume = "52", number = "1", pages = "680--693", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009861", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Formal constructive type theory has proved to be an effective language for mechanized proof. By avoiding non-constructive principles, such as the law of the excluded middle, type theory admits sharper proofs and broader interpretations of results. From a computer science perspective, interest in type theory arises from its applications to programming languages. Standard constructive type theories used in mechanization admit computational interpretations based on meta-mathematical normalization theorems. These proofs are notoriously brittle; any change to the theory potentially invalidates its computational meaning. As a case in point, Voevodsky's univalence axiom raises questions about the computational meaning of proofs. We consider the question: Can higher-dimensional type theory be construed as a programming language? We answer this question affirmatively by providing a direct, deterministic operational interpretation for a representative higher-dimensional dependent type theory with higher inductive types and an instance of univalence. Rather than being a formal type theory defined by rules, it is instead a computational type theory in the sense of Martin-L{\"o}f's meaning explanations and of the NuPRL semantics. The definition of the type theory starts with programs; types are specifications of program behavior. The main result is a canonicity theorem stating that closed programs of boolean type evaluate to true or false.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Chang:2017:TSM, author = "Stephen Chang and Alex Knauth and Ben Greenman", title = "Type systems as macros", journal = j-SIGPLAN, volume = "52", number = "1", pages = "694--705", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009886", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Turnstile, a metalanguage for creating typed embedded languages. To implement the type system, programmers write type checking rules resembling traditional judgment syntax.
To implement the semantics, they incorporate elaborations into these rules. Turnstile critically depends on the idea of linguistic reuse. It exploits a macro system in a novel way to simultaneously type check and rewrite a surface program into a target language. Reusing a macro system also yields modular implementations whose rules may be mixed and matched to create other languages. Combined with typical compiler and runtime reuse, Turnstile produces performant typed embedded languages with little effort.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Kumar:2017:PFA, author = "Ananya Kumar and Guy E. Blelloch and Robert Harper", title = "Parallel functional arrays", journal = j-SIGPLAN, volume = "52", number = "1", pages = "706--718", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009869", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The goal of this paper is to develop a form of functional arrays (sequences) that are as efficient as imperative arrays, can be used in parallel, and have well defined cost-semantics. The key idea is to consider sequences with functional value semantics but non-functional cost semantics. Because the value semantics is functional, ``updating'' a sequence returns a new sequence. We allow operations on ``older'' sequences (called interior sequences) to be more expensive than operations on the ``most recent'' sequences (called leaf sequences). We embed sequences in a language supporting fork-join parallelism. Due to the parallelism, operations can be interleaved non-deterministically, and, in conjunction with the different cost for interior and leaf sequences, this can lead to non-deterministic costs for a program. Consequently the costs of programs can be difficult to analyze. The main result is the derivation of a deterministic cost dynamics which makes analyzing the costs easier. The theorems are not specific to sequences and can be applied to other data types with different costs for operating on interior and leaf versions. We present a wait-free concurrent implementation of sequences that requires constant work for accessing and updating leaf sequences, and logarithmic work for accessing and linear work for updating interior sequences. We sketch a proof of correctness for the sequence implementation. The key advantages of the present approach compared to current approaches are that our implementation requires no changes to existing programming languages, supports nested parallelism, and has well defined cost semantics.
At the same time, it allows for functional implementations of algorithms such as depth-first search with the same asymptotic complexity as imperative implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Konnov:2017:SCP, author = "Igor Konnov and Marijana Lazi{\'c} and Helmut Veith and Josef Widder", title = "A short counterexample property for safety and liveness verification of fault-tolerant distributed algorithms", journal = j-SIGPLAN, volume = "52", number = "1", pages = "719--734", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009860", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Distributed algorithms have many mission-critical applications ranging from embedded systems and replicated databases to cloud computing. Due to asynchronous communication, process faults, or network failures, these algorithms are difficult to design and verify. Many algorithms achieve fault tolerance by using threshold guards that, for instance, ensure that a process waits until it has received an acknowledgment from a majority of its peers. Consequently, domain-specific languages for fault-tolerant distributed systems offer language support for threshold guards. We introduce an automated method for model checking of safety and liveness of threshold-guarded distributed algorithms in systems where the number of processes and the fraction of faulty processes are parameters. Our method is based on a short counterexample property: if a distributed algorithm violates a temporal specification (in a fragment of LTL), then there is a counterexample whose length is bounded and independent of the parameters. We prove this property by (i) characterizing executions depending on the structure of the temporal formula, and (ii) using commutativity of transitions to accelerate and shorten executions. We extended the ByMC toolset (Byzantine Model Checker) with our technique, and verified liveness and safety of 10 prominent fault-tolerant distributed algorithms, most of which were out of reach for existing techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Liu:2017:ADB, author = "Xinxin Liu and Tingting Yu and Wenhui Zhang", title = "Analyzing divergence in bisimulation semantics", journal = j-SIGPLAN, volume = "52", number = "1", pages = "735--747", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009870", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Some bisimulation based abstract equivalence relations may equate divergent systems with non-divergent ones, examples including weak bisimulation equivalence and branching bisimulation equivalence. Thus extra efforts are needed to analyze divergence for the compared systems. In this paper we propose a new method for analyzing divergence in bisimulation semantics, which relies only on simple observations of individual transitions. 
We show that this method can verify several typical divergence preserving bisimulation equivalences including two well-known ones. As an application case study, we use the proposed method to verify the HSY collision stack to draw the conclusion that the stack implementation is correct in terms of linearizability with lock-free progress condition.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Lange:2017:FGL, author = "Julien Lange and Nicholas Ng and Bernardo Toninho and Nobuko Yoshida", title = "Fencing off {Go}: liveness and safety for channel-based programming", journal = j-SIGPLAN, volume = "52", number = "1", pages = "748--761", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009847", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Go is a production-level statically typed programming language whose design features explicit message-passing primitives and lightweight threads, enabling (and encouraging) programmers to develop concurrent systems where components interact through communication more so than by lock-based shared memory concurrency. Go can only detect global deadlocks at runtime, but provides no compile-time protection against all too common communication mismatches or partial deadlocks. This work develops a static verification framework for bounded liveness and safety in Go programs, able to detect communication errors and partial deadlocks in a general class of realistic concurrent programs, including those with dynamic channel creation and infinite recursion. Our approach infers from a Go program a faithful representation of its communication patterns as a behavioural type. By checking a syntactic restriction on channel usage, dubbed fencing, we ensure that programs are made up of finitely many different communication patterns that may be repeated infinitely many times. This restriction allows us to implement bounded verification procedures (akin to bounded model checking) to check for liveness and safety in types which in turn approximates liveness and safety in Go programs. We have implemented a type inference and liveness and safety checks in a tool-chain and tested it against publicly available Go programs. Updated on 27th Feb 2017. See Comments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Vitousek:2017:BTL, author = "Michael M. Vitousek and Cameron Swords and Jeremy G. Siek", title = "Big types in little runtime: open-world soundness and collaborative blame for gradual type systems", journal = j-SIGPLAN, volume = "52", number = "1", pages = "762--774", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009849", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Gradual typing combines static and dynamic typing in the same language, offering programmers the error detection and strong guarantees of static types and the rapid prototyping and flexible programming idioms of dynamic types. 
Many gradually typed languages are implemented by translation into an untyped target language (e.g., Typed Clojure, TypeScript, Gradualtalk, and Reticulated Python). For such languages, it is desirable to support arbitrary interaction between translated code and legacy code in the untyped language while maintaining the type soundness of the translated code. In this paper we formalize this goal in the form of the open-world soundness criterion. We discuss why it is challenging to achieve open-world soundness using the traditional proxy-based approach for higher-order casts. However, the transient design satisfies open-world soundness. Indeed, we present a formal semantics for the transient design and prove that our semantics satisfies open-world soundness. In this paper we also solve a challenging problem for the transient design: how to provide blame tracking without proxies. We define a semantics for blame and prove the Blame Theorem. We also prove that the Gradual Guarantee holds for this system, ensuring that programs can be evolved freely between static and dynamic typing. Finally, we demonstrate that the runtime overhead of the transient approach is low in the context of Reticulated Python, an implementation of gradual typing for Python.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Lehmann:2017:GRT, author = "Nico Lehmann and {\'E}ric Tanter", title = "Gradual refinement types", journal = j-SIGPLAN, volume = "52", number = "1", pages = "775--788", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009856", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Refinement types are an effective language-based verification technique. However, as any expressive typing discipline, its strength is its weakness, imposing sometimes undesired rigidity. Guided by abstract interpretation, we extend the gradual typing agenda and develop the notion of gradual refinement types, allowing smooth evolution and interoperability between simple types and logically-refined types. In doing so, we address two challenges unexplored in the gradual typing literature: dealing with imprecise logical information, and with dependent function types. The first challenge leads to a crucial notion of locality for refinement formulas, and the second yields novel operators related to type- and term-level substitution, identifying new opportunity for runtime errors in gradual dependently-typed languages. The gradual language we present is type safe, type sound, and satisfies the refined criteria for gradually-typed languages of Siek et al. We also explain how to extend our approach to richer refinement logics, anticipating key challenges to consider.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Cimini:2017:AGD, author = "Matteo Cimini and Jeremy G. 
Siek", title = "Automatically generating the dynamic semantics of gradually typed languages", journal = j-SIGPLAN, volume = "52", number = "1", pages = "789--803", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009863", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many language designers have adopted gradual typing. However, there remains open questions regarding how to gradualize languages. Cimini and Siek (2016) created a methodology and algorithm to automatically generate the type system of a gradually typed language from a fully static version of the language. In this paper, we address the next challenge of how to automatically generate the dynamic semantics of gradually typed languages. Such languages typically use an intermediate language with explicit casts. Our first result is a methodology for generating the syntax, type system, and dynamic semantics of the intermediate language with casts. Next, we present an algorithm that formalizes and automates the methodology, given a language definition as input. We show that our approach is general enough to automatically gradualize several languages, including features such as polymorphism, recursive types and exceptions. We prove that our algorithm produces languages that satisfy the key correctness criteria of gradual typing. Finally, we implement the algorithm, generating complete specifications of gradually typed languages in lambda-Prolog, including executable interpreters.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Jafery:2017:SUR, author = "Khurram A. Jafery and Joshua Dunfield", title = "Sums of uncertainty: refinements go gradual", journal = j-SIGPLAN, volume = "52", number = "1", pages = "804--817", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009865", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A long-standing shortcoming of statically typed functional languages is that type checking does not rule out pattern-matching failures (run-time match exceptions). Refinement types distinguish different values of datatypes; if a program annotated with refinements passes type checking, pattern-matching failures become impossible. Unfortunately, refinement is a monolithic property of a type, exacerbating the difficulty of adding refinement types to nontrivial programs. Gradual typing has explored how to incrementally move between static typing and dynamic typing. We develop a type system of gradual sums that combines refinement with imprecision. Then, we develop a bidirectional version of the type system, which rules out excessive imprecision, and give a type-directed translation to a target language with explicit casts. We prove that the static sublanguage cannot have match failures, that a well-typed program remains well-typed if its type annotations are made less precise, and that making annotations less precise causes target programs to fail later. Several of these results correspond to criteria for gradual typing given by Siek et al. 
(2015).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Ying:2017:IQP, author = "Mingsheng Ying and Shenggang Ying and Xiaodi Wu", title = "Invariants of quantum programs: characterisations and generation", journal = j-SIGPLAN, volume = "52", number = "1", pages = "818--832", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009840", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Program invariant is a fundamental notion widely used in program verification and analysis. The aim of this paper is twofold: (i) find an appropriate definition of invariants for quantum programs; and (ii) develop an effective technique of invariant generation for verification and analysis of quantum programs. Interestingly, the notion of invariant can be defined for quantum programs in two different ways --- additive invariants and multiplicative invariants --- corresponding to two interpretations of implication in a continuous valued logic: the Lukasiewicz implication and the Godel implication. It is shown that both of them can be used to establish partial correctness of quantum programs. The problem of generating additive invariants of quantum programs is addressed by reducing it to an SDP (Semidefinite Programming) problem. This approach is applied with an SDP solver to generate invariants of two important quantum algorithms --- quantum walk and quantum Metropolis sampling. Our examples show that the generated invariants can be used to verify correctness of these algorithms and are helpful in optimising quantum Metropolis sampling. To our knowledge, this paper is the first attempt to define the notion of invariant and to develop a method of invariant generation for quantum programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{DalLago:2017:GPC, author = "Ugo {Dal Lago} and Claudia Faggian and Beno{\^\i}t Valiron and Akira Yoshimizu", title = "The geometry of parallelism: classical, probabilistic, and quantum effects", journal = j-SIGPLAN, volume = "52", number = "1", pages = "833--845", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009859", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a Geometry of Interaction model for higher-order quantum computation, and prove its adequacy for a fully fledged quantum programming language in which entanglement, duplication, and recursion are all available. This model is an instance of a new framework which captures not only quantum but also classical and probabilistic computation. Its main feature is the ability to model commutative effects in a parallel setting. Our model comes with a multi-token machine, a proof net system, and a -style language. 
Being based on a multi-token machine equipped with a memory, it has a concrete nature which makes it well suited for building low-level operational descriptions of higher-order languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Paykin:2017:QCL, author = "Jennifer Paykin and Robert Rand and Steve Zdancewic", title = "{QWIRE}: a core language for quantum circuits", journal = j-SIGPLAN, volume = "52", number = "1", pages = "846--858", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009894", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces QWIRE (``choir''), a language for defining quantum circuits and an interface for manipulating them inside of an arbitrary classical host language. QWIRE is minimal---it contains only a few primitives---and sound with respect to the physical properties entailed by quantum mechanics. At the same time, QWIRE is expressive and highly modular due to its relationship with the host language, mirroring the QRAM model of computation that places a quantum computer (controlled by circuits) alongside a classical computer (controlled by the host language). We present QWIRE along with its type system and operational semantics, which we prove is safe and strongly normalizing whenever the host language is. We give circuits a denotational semantics in terms of density matrices. Throughout, we investigate examples that demonstrate the expressive power of QWIRE, including extensions to the host language that (1) expose a general analysis framework for circuits, and (2) provide dependent types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Amin:2017:LVA, author = "Nada Amin and Tiark Rompf", title = "{LMS-Verify}: abstraction without regret for verified systems programming", journal = j-SIGPLAN, volume = "52", number = "1", pages = "859--873", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009867", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Performance critical software is almost always developed in C, as programmers do not trust high-level languages to deliver the same reliable performance. This is bad because low-level code in unsafe languages attracts security vulnerabilities and because development is far less productive, with PL advances mostly lost on programmers operating under tight performance constraints. High-level languages provide memory safety out of the box, but they are deemed too slow and unpredictable for serious system software. Recent years have seen a surge in staging and generative programming: the key idea is to use high-level languages and their abstraction power as glorified macro systems to compose code fragments in first-order, potentially domain-specific, intermediate languages, from which fast C can be emitted. But what about security? Since the end result is still C code, the safety guarantees of the high-level host language are lost. 
In this paper, we extend this generative approach to emit ACSL specifications along with C code. We demonstrate that staging achieves ``abstraction without regret'' for verification: we show how high-level programming models, in particular higher-order composable contracts from dynamic languages, can be used at generation time to compose and generate first-order specifications that can be statically checked by existing tools. We also show how type classes can automatically attach invariants to data types, reducing the need for repetitive manual annotations. We evaluate our system on several case studies that varyingly exercise verification of memory safety, overflow safety, and functional correctness. We feature an HTTP parser that is (1) fast (2) high-level: implemented using staged parser combinators (3) secure: with verified memory safety. This result is significant, as input parsing is a key attack vector, and vulnerabilities related to HTTP parsing have been documented in all widely-used web servers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Assaf:2017:HSA, author = "Mounir Assaf and David A. Naumann and Julien Signoles and {\'E}ric Totel and Fr{\'e}d{\'e}ric Tronel", title = "Hypercollecting semantics and its application to static analysis of information flow", journal = j-SIGPLAN, volume = "52", number = "1", pages = "874--887", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009889", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We show how static analysis for secure information flow can be expressed and proved correct entirely within the framework of abstract interpretation. The key idea is to define a Galois connection that directly approximates the hyperproperty of interest. To enable use of such Galois connections, we introduce a fixpoint characterisation of hypercollecting semantics, i.e. a ``set of sets'' transformer. This makes it possible to systematically derive static analyses for hyperproperties entirely within the calculational framework of abstract interpretation. We evaluate this technique by deriving example static analyses. For qualitative information flow, we derive a dependence analysis similar to the logic of Amtoft and Banerjee (SAS'04) and the type system of Hunt and Sands (POPL'06). For quantitative information flow, we derive a novel cardinality analysis that bounds the leakage conveyed by a program instead of simply deciding whether it exists. This encompasses problems that are hypersafety but not $k$-safety.
We put the framework to use and introduce variations that achieve precision rivalling the most recent and precise static analyses for information flow.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Zhang:2017:LTA, author = "Danfeng Zhang and Daniel Kifer", title = "{LightDP}: towards automating differential privacy proofs", journal = j-SIGPLAN, volume = "52", number = "1", pages = "888--901", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009884", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The growing popularity and adoption of differential privacy in academic and industrial settings has resulted in the development of increasingly sophisticated algorithms for releasing information while preserving privacy. Accompanying this phenomenon is the natural rise in the development and publication of incorrect algorithms, thus demonstrating the necessity of formal verification tools. However, existing formal methods for differential privacy face a dilemma: methods based on customized logics can verify sophisticated algorithms but come with a steep learning curve and significant annotation burden on the programmers, while existing programming platforms lack expressive power for some sophisticated algorithms. In this paper, we present LightDP, a simple imperative language that strikes a better balance between expressive power and usability. The core of LightDP is a novel relational type system that separates relational reasoning from privacy budget calculations. With dependent types, the type system is powerful enough to verify sophisticated algorithms where the composition theorem falls short. In addition, the inference engine of LightDP infers most of the proof details, and even searches for the proof with minimal privacy cost when multiple proofs exist. We show that LightDP verifies sophisticated algorithms with little manual effort.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "POPL '17 conference proceedings.", } @Article{Tallada:2016:CGP, author = "Marc Gonzalez Tallada", title = "Coarse grain parallelization of deep neural networks", journal = j-SIGPLAN, volume = "51", number = "8", pages = "1:1--1:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851158", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Deep neural networks (DNN) have recently achieved extraordinary results in domains like computer vision and speech recognition. An essential element for this success has been the introduction of high performance computing (HPC) techniques in the critical step of training the neural network. This paper describes the implementation and analysis of a network-agnostic and convergence-invariant coarse-grain parallelization of the DNN training algorithm. The coarse-grain parallelization is achieved through the exploitation of the batch-level parallelism. This strategy is independent from the support of specialized and optimized libraries. 
Therefore, the optimization is immediately available for accelerating the DNN training. The proposal is compatible with multi-GPU execution without altering the algorithm convergence rate. The parallelization has been implemented in Caffe, a state-of-the-art DNN framework. The paper describes the code transformations for the parallelization and we also identify the limiting performance factors of the approach. We show competitive performance results for two state-of-the-art computer vision datasets, MNIST and CIFAR-10. In particular, on a 16-core Xeon E5-2667v2 at 3.30GHz we observe speedups of 8$ \times $ over the sequential execution, at similar performance levels of those obtained by the GPU optimized Caffe version in a NVIDIA K40 GPU.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Wang:2016:HPM, author = "Xiao Wang and Amit Sabne and Sherman Kisner and Anand Raghunathan and Charles Bouman and Samuel Midkiff", title = "High performance model based image reconstruction", journal = j-SIGPLAN, volume = "51", number = "8", pages = "2:1--2:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851163", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computed Tomography (CT) Image Reconstruction is an important technique used in a wide range of applications, ranging from explosive detection, medical imaging to scientific imaging. Among available reconstruction methods, Model Based Iterative Reconstruction (MBIR) produces higher quality images and allows for the use of more general CT scanner geometries than is possible with more commonly used methods. The high computational cost of MBIR, however, often makes it impractical in applications for which it would otherwise be ideal. This paper describes a new MBIR implementation that significantly reduces the computational cost of MBIR while retaining its benefits. It describes a novel organization of the scanner data into super-voxels (SV) that, combined with a super-voxel buffer (SVB), dramatically increase locality and prefetching, enable parallelism across SVs and lead to an average speedup of 187 on 20 cores.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Agrawal:2016:EAE, author = "Sandeep R. Agrawal and Christopher M. Dee and Alvin R. Lebeck", title = "Exploiting accelerators for efficient high dimensional similarity search", journal = j-SIGPLAN, volume = "51", number = "8", pages = "3:1--3:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851144", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Similarity search finds the most similar matches in an object collection for a given query; making it an important problem across a wide range of disciplines such as web search, image recognition and protein sequencing. 
Practical implementations of High Dimensional Similarity Search (HDSS) search across billions of possible solutions for multiple queries in real time, making its performance and efficiency a significant challenge. Existing clusters and datacenters use commercial multicore hardware to perform search, which may not provide the optimal performance and performance per Watt. This work explores the performance, power and cost benefits of using throughput accelerators like GPUs to perform similarity search for query cohorts even under tight deadlines. We propose optimized implementations of similarity search for both the host and the accelerator. Augmenting existing Xeon servers with accelerators results in a 3$ \times $ improvement in throughput per machine, resulting in a more than 2.5$ \times $ reduction in cost of ownership, even for discounted Xeon servers. Replacing a Xeon based cluster with an accelerator based cluster for similarity search reduces the total cost of ownership by more than 6$ \times $ to 16$ \times $ while consuming significantly less power than an ARM based cluster.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Cruz:2016:DCG, author = "Flavio Cruz and Ricardo Rocha and Seth Copen Goldstein", title = "Declarative coordination of graph-based parallel programs", journal = j-SIGPLAN, volume = "51", number = "8", pages = "4:1--4:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851153", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Declarative programming has been hailed as a promising approach to parallel programming since it makes it easier to reason about programs while hiding the implementation details of parallelism from the programmer. However, its advantage is also its disadvantage as it leaves the programmer with no straightforward way to optimize programs for performance. In this paper, we introduce Coordinated Linear Meld (CLM), a concurrent forward-chaining linear logic programming language, with a declarative way to coordinate the execution of parallel programs allowing the programmer to specify arbitrary scheduling and data partitioning policies. Our approach allows the programmer to write graph-based declarative programs and then optionally to use coordination to fine-tune parallel performance. In this paper we specify the set of coordination facts, discuss their implementation in a parallel virtual machine, and show---through example---how they can be used to optimize parallel execution. 
We compare the performance of CLM programs against the original uncoordinated Linear Meld and several other frameworks.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Denniston:2016:DH, author = "Tyler Denniston and Shoaib Kamil and Saman Amarasinghe", title = "Distributed {Halide}", journal = j-SIGPLAN, volume = "51", number = "8", pages = "5:1--5:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851157", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many image processing tasks are naturally expressed as a pipeline of small computational kernels known as stencils. Halide is a popular domain-specific language and compiler designed to implement image processing algorithms. Halide uses simple language constructs to express what to compute and a separate scheduling co-language for expressing when and where to perform the computation. This approach has demonstrated performance comparable to or better than hand-optimized code. Until now, however, Halide has been restricted to parallel shared memory execution, limiting its performance for memory-bandwidth-bound pipelines or large-scale image processing tasks. We present an extension to Halide to support distributed-memory parallel execution of complex stencil pipelines. These extensions compose with the existing scheduling constructs in Halide, allowing expression of complex computation and communication strategies. Existing Halide applications can be distributed with minimal changes, allowing programmers to explore the tradeoff between recomputation and communication with little effort. Approximately 10 new lines of code are needed even for a 200 line, 99 stage application. On nine image processing benchmarks, our extensions give up to a 1.4$ \times $ speedup on a single node over regular multithreaded execution with the same number of cores, by mitigating the effects of non-uniform memory access. The distributed benchmarks achieve up to 18$ \times $ speedup on a 16 node testing machine and up to 57$ \times $ speedup on 64 nodes of the NERSC Cori supercomputer.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Newton:2016:PTC, author = "Ryan R. Newton and {\"O}mer S. Agacan and Peter Fogg and Sam Tobin-Hochstadt", title = "Parallel type-checking with {Haskell} using saturating {LVars} and stream generators", journal = j-SIGPLAN, volume = "51", number = "8", pages = "6:1--6:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851142", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Given the sophistication of recent type systems, unification-based type-checking and inference can be a time-consuming phase of compilation---especially when union types are combined with subtyping.
It is natural to consider improving performance through parallelism, but these algorithms are challenging to parallelize due to complicated control structure and difficulties representing data in a way that is both efficient and supports concurrency. We provide techniques that address these problems based on the LVish approach to deterministic-by-default parallel programming. We extend LVish with Saturating LVars, the first LVars implemented to release memory during the object's lifetime. Our design allows us to achieve a parallel speedup on worst-case (exponential) inputs of Hindley-Milner inference, and on the Typed Racket type-checking algorithm, which yields up to an 8.46$ \times $ parallel speedup on 14 cores for type-checking examples drawn from the Racket repository.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Wang:2016:APG, author = "Lei Wang and Fan Yang and Liangji Zhuang and Huimin Cui and Fang Lv and Xiaobing Feng", title = "Articulation points guided redundancy elimination for betweenness centrality", journal = j-SIGPLAN, volume = "51", number = "8", pages = "7:1--7:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851154", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Betweenness centrality (BC) is an important metric in graph analysis which indicates critical vertices in large-scale networks based on shortest path enumeration. Typically, a BC algorithm constructs a shortest-path DAG for each vertex to calculate its BC score. However, for emerging real-world graphs, even the state-of-the-art BC algorithm will introduce a number of redundancies, as suggested by the existence of articulation points. Articulation points imply some common sub-DAGs in the DAGs for different vertices, but existing algorithms do not leverage such information and miss the optimization opportunity. We propose a redundancy elimination approach, which identifies the common sub-DAGs shared between the DAGs for different vertices. Our approach leverages the articulation points and reuses the results of the common sub-DAGs in calculating the BC scores, which eliminates redundant computations. We implemented the approach as an algorithm with two-level parallelism and evaluated it on a multicore platform.
Compared to the state-of-the-art implementation using shared memory, our approach achieves an average speedup of 4.6x across a variety of real-world graphs, with the traversal rates up to 45--2400 MTEPS (Millions of Traversed Edges per Second).", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Bloemen:2016:MCF, author = "Vincent Bloemen and Alfons Laarman and Jaco van de Pol", title = "Multi-core on-the-fly {SCC} decomposition", journal = j-SIGPLAN, volume = "51", number = "8", pages = "8:1--8:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851161", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The main advantages of Tarjan's strongly connected component (SCC) algorithm are its linear time complexity and ability to return SCCs on-the-fly, while traversing or even generating the graph. Until now, most parallel SCC algorithms sacrifice both: they run in quadratic worst-case time and/or require the full graph in advance. The current paper presents a novel parallel, on-the-fly SCC algorithm. It preserves the linear-time property by letting workers explore the graph randomly while carefully communicating partially completed SCCs. We prove that this strategy is correct. For efficiently communicating partial SCCs, we develop a concurrent, iterable disjoint set structure (combining the union-find data structure with a cyclic list). We demonstrate scalability on a 64-core machine using 75 real-world graphs (from model checking and explicit data graphs), synthetic graphs (combinations of trees, cycles and linear graphs), and random graphs. Previous work did not show speedups for graphs containing a large SCC. We observe that our parallel algorithm is typically 10-30$ \times $ faster compared to Tarjan's algorithm for graphs containing a large SCC. Comparable performance (with respect to the current state-of-the-art) is obtained for graphs containing many small SCCs.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Kannan:2016:HPP, author = "Ramakrishnan Kannan and Grey Ballard and Haesun Park", title = "A high-performance parallel algorithm for nonnegative matrix factorization", journal = j-SIGPLAN, volume = "51", number = "8", pages = "9:1--9:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851152", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Non-negative matrix factorization (NMF) is the problem of determining two non-negative low rank factors W and H, for the given input matrix A, such that $A \approx WH$. NMF is a useful tool for many applications in different domains such as topic modeling in text mining, background separation in video analysis, and community detection in social networks. Despite its popularity in the data mining community, there is a lack of efficient distributed algorithms to solve the problem for big data sets.
We propose a high-performance distributed-memory parallel algorithm that computes the factorization by iteratively solving alternating non-negative least squares (NLS) subproblems for W and H. It maintains the data and factor matrices in memory (distributed across processors), uses MPI for interprocessor communication, and, in the dense case, provably minimizes communication costs (under mild assumptions). As opposed to previous implementations, our algorithm is also flexible: (1) it performs well for both dense and sparse matrices, and (2) it allows the user to choose any one of the multiple algorithms for solving the updates to low rank factors W and H within the alternating iterations. We demonstrate the scalability of our algorithm and compare it with baseline implementations, showing significant performance improvements.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Chowdhury:2016:AAD, author = "Rezaul Chowdhury and Pramod Ganapathi and Jesmin Jahan Tithi and Charles Bachmeier and Bradley C. Kuszmaul and Charles E. Leiserson and Armando Solar-Lezama and Yuan Tang", title = "{AUTOGEN}: automatic discovery of cache-oblivious parallel recursive algorithms for solving dynamic programs", journal = j-SIGPLAN, volume = "51", number = "8", pages = "10:1--10:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851167", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present AUTOGEN---an algorithm that for a wide class of dynamic programming (DP) problems automatically discovers highly efficient cache-oblivious parallel recursive divide-and-conquer algorithms from inefficient iterative descriptions of DP recurrences. AUTOGEN analyzes the set of DP table locations accessed by the iterative algorithm when run on a DP table of small size, and automatically identifies a recursive access pattern and a corresponding provably correct recursive algorithm for solving the DP recurrence. We use AUTOGEN to autodiscover efficient algorithms for several well-known problems. Our experimental results show that several autodiscovered algorithms significantly outperform parallel looping and tiled loop-based algorithms. Also these algorithms are less sensitive to fluctuations of memory and bandwidth compared with their looping counterparts, and their running times and energy profiles remain relatively more stable. To the best of our knowledge, AUTOGEN is the first algorithm that can automatically discover new nontrivial divide-and-conquer algorithms.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Wang:2016:GHP, author = "Yangzihao Wang and Andrew Davidson and Yuechao Pan and Yuduo Wu and Andy Riffel and John D. 
Owens", title = "{Gunrock}: a high-performance graph processing library on the {GPU}", journal = j-SIGPLAN, volume = "51", number = "8", pages = "11:1--11:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851145", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For large-scale graph analytics on the GPU, the irregularity of data access/control flow and the complexity of programming GPUs have been two significant challenges for developing a programmable high-performance graph library. ``Gunrock,'' our high-level bulk-synchronous graph-processing system targeting the GPU, takes a new approach to abstracting GPU graph analytics: rather than designing an abstraction around computation, Gunrock instead implements a novel data-centric abstraction centered on operations on a vertex or edge frontier. Gunrock achieves a balance between performance and expressiveness by coupling high-performance GPU computing primitives and optimization strategies with a high-level programming model that allows programmers to quickly develop new graph primitives with small code size and minimal GPU programming knowledge. We evaluate Gunrock on five graph primitives (BFS, BC, SSSP, CC, and PageRank) and show that Gunrock has on average at least an order of magnitude speedup over Boost and PowerGraph, comparable performance to the fastest GPU hardwired primitives, and better performance than any other GPU high-level graph library.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Ashkiani:2016:GM, author = "Saman Ashkiani and Andrew Davidson and Ulrich Meyer and John D. Owens", title = "{GPU} multisplit", journal = j-SIGPLAN, volume = "51", number = "8", pages = "12:1--12:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851169", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multisplit is a broadly useful parallel primitive that permutes its input data into contiguous buckets or bins, where the function that categorizes an element into a bucket is provided by the programmer. Due to the lack of an efficient multisplit on GPUs, programmers often choose to implement multisplit with a sort. However, sort does more work than necessary to implement multisplit, and is thus inefficient. In this work, we provide a parallel model and multiple implementations for the multisplit problem. Our principal focus is multisplit for a small number of buckets. In our implementations, we exploit the computational hierarchy of the GPU to perform most of the work locally, with minimal usage of global operations. We also use warp-synchronous programming models to avoid branch divergence and reduce memory usage, as well as hierarchical reordering of input elements to achieve better coalescing of global memory accesses. 
On an NVIDIA K40c GPU, for key-only (key-value) multisplit, we demonstrate a 3.0-6.7x (4.4-8.0x) speedup over radix sort, and achieve a peak throughput of 10.0 G keys/s.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Matteis:2016:KCR, author = "Tiziano {De Matteis} and Gabriele Mencagli", title = "Keep calm and react with foresight: strategies for low-latency and energy-efficient elastic data stream processing", journal = j-SIGPLAN, volume = "51", number = "8", pages = "13:1--13:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851148", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper addresses the problem of designing scaling strategies for elastic data stream processing. Elasticity allows applications to rapidly change their configuration on-the-fly (e.g., the amount of used resources) in response to dynamic workload fluctuations. In this work we face this problem by adopting the Model Predictive Control technique, a control-theoretic method aimed at finding the optimal application configuration along a limited prediction horizon in the future by solving an online optimization problem. Our control strategies are designed to address latency constraints, using Queueing Theory models, and energy consumption by changing the number of used cores and the CPU frequency through the Dynamic Voltage and Frequency Scaling (DVFS) support available in the modern multicore CPUs. The proactive capabilities, in addition to the latency- and energy-awareness, represent the novel features of our approach. To validate our methodology, we develop a thorough set of experiments on a high-frequency trading application. The results demonstrate the high-degree of flexibility and configurability of our approach, and show the effectiveness of our elastic scaling strategies compared with existing state-of-the-art techniques used in similar scenarios.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Li:2016:WSI, author = "Jing Li and Kunal Agrawal and Sameh Elnikety and Yuxiong He and I-Ting Angelina Lee and Chenyang Lu and Kathryn S. McKinley", title = "Work stealing for interactive services to meet target latency", journal = j-SIGPLAN, volume = "51", number = "8", pages = "14:1--14:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851151", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Interactive web services increasingly drive critical business workloads such as search, advertising, games, shopping, and finance. Whereas optimizing parallel programs and distributed server systems have historically focused on average latency and throughput, the primary metric for interactive applications is instead consistent responsiveness, i.e., minimizing the number of requests that miss a target latency. 
This paper is the first to show how to generalize work-stealing, which is traditionally used to minimize the makespan of a single parallel job, to optimize for a target latency in interactive services with multiple parallel requests. We design a new adaptive work stealing policy, called tail-control, that reduces the number of requests that miss a target latency. It uses instantaneous request progress, system load, and a target latency to choose when to parallelize requests with stealing, when to admit new requests, and when to limit parallelism of large requests. We implement this approach in the Intel Thread Building Block (TBB) library and evaluate it on real-world workloads and synthetic workloads. The tail-control policy substantially reduces the number of requests exceeding the desired target latency and delivers up to 58\% relative improvement over various baseline policies. This generalization of work stealing for multiple requests effectively optimizes the number of requests that complete within a target latency, a key metric for interactive services.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Steele:2016:AAC, author = "Guy L. {Steele, Jr.} and Jean-Baptiste Tristan", title = "Adding approximate counters", journal = j-SIGPLAN, volume = "51", number = "8", pages = "15:1--15:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851147", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a general framework for adding the values of two approximate counters to produce a new approximate counter value whose expected estimated value is equal to the sum of the expected estimated values of the given approximate counters. (To the best of our knowledge, this is the first published description of any algorithm for adding two approximate counters.) We then work out implementation details for five different kinds of approximate counter and provide optimized pseudocode. For three of them, we present proofs that the variance of a counter value produced by adding two counter values in this way is bounded, and in fact is no worse, or not much worse, than the variance of the value of a single counter to which the same total number of increment operations have been applied. Addition of approximate counters is useful in massively parallel divide-and-conquer algorithms that use a distributed representation for large arrays of counters. 
We describe two machine-learning algorithms for topic modeling that use millions of integer counters, and confirm that replacing the integer counters with approximate counters is effective, speeding up a GPU-based implementation by over 65\% and a CPU-based implementation by nearly 50\%, as well as reducing memory requirements, without degrading their statistical effectiveness.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Yang:2016:WFQ, author = "Chaoran Yang and John Mellor-Crummey", title = "A wait-free queue as fast as fetch-and-add", journal = j-SIGPLAN, volume = "51", number = "8", pages = "16:1--16:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851168", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent data structures that have fast and predictable performance are of critical importance for harnessing the power of multicore processors, which are now ubiquitous. Although wait-free objects, whose operations complete in a bounded number of steps, were devised more than two decades ago, wait-free objects that can deliver scalable high performance are still rare. In this paper, we present the first wait-free FIFO queue based on fetch-and-add (FAA). While compare-and-swap (CAS) based non-blocking algorithms may perform poorly due to work wasted by CAS failures, algorithms that coordinate using FAA, which is guaranteed to succeed, can in principle perform better under high contention. Along with FAA, our queue uses a custom epoch-based scheme to reclaim memory; on x86 architectures, it requires no extra memory fences on our algorithm's typical execution path. An empirical study of our new FAA-based wait-free FIFO queue under high contention on four different architectures with many hardware threads shows that it outperforms prior queue designs that lack a wait-free progress guarantee. Surprisingly, at the highest level of contention, the throughput of our queue is often as high as that of a microbenchmark that only performs FAA. As a result, our fast wait-free queue implementation is useful in practice on most multi-core systems today. We believe that our design can serve as an example of how to construct other fast wait-free objects.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Haider:2016:LRA, author = "Syed Kamran Haider and William Hasenplaugh and Dan Alistarh", title = "Lease\slash release: architectural support for scaling contended data structures", journal = j-SIGPLAN, volume = "51", number = "8", pages = "17:1--17:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851155", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "High memory contention is generally agreed to be a worst-case scenario for concurrent data structures.
There has been a significant amount of research effort spent investigating designs which minimize contention, and several programming techniques have been proposed to mitigate its effects. However, there are currently few architectural mechanisms to allow scaling contended data structures at high thread counts. In this paper, we investigate hardware support for scalable contended data structures. We propose Lease/Release, a simple addition to standard directory-based MSI cache coherence protocols, allowing participants to lease memory, at the granularity of cache lines, by delaying coherence messages for a short, bounded period of time. Our analysis shows that Lease/Release can significantly reduce the overheads of contention for both non-blocking (lock-free) and lock-based data structure implementations, while ensuring that no deadlocks are introduced. We validate Lease/Release empirically on the Graphite multiprocessor simulator, on a range of data structures, including queue, stack, and priority queue implementations, as well as on transactional applications. Results show that Lease/Release consistently improves both throughput and energy usage, by up to 5x, both for lock-free and lock-based data structure designs.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Guerraoui:2016:OCO, author = "Rachid Guerraoui and Vasileios Trigonakis", title = "Optimistic concurrency with {OPTIK}", journal = j-SIGPLAN, volume = "51", number = "8", pages = "18:1--18:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851146", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce OPTIK, a new practical design pattern for designing and implementing fast and scalable concurrent data structures. OPTIK relies on the commonly-used technique of version numbers for detecting conflicting concurrent operations. We show how to implement the OPTIK pattern using the novel concept of OPTIK locks. These locks enable the use of version numbers for implementing very efficient optimistic concurrent data structures. Existing state-of-the-art lock-based data structures acquire the lock and then check for conflicts. In contrast, with OPTIK locks, we merge the lock acquisition with the detection of conflicting concurrency in a single atomic step, similarly to lock-free algorithms. We illustrate the power of our OPTIK pattern and its implementation by introducing four new algorithms and by optimizing four state-of-the-art algorithms for linked lists, skip lists, hash tables, and queues. 
Our results show that concurrent data structures built using OPTIK are more scalable than the state of the art.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Dice:2016:RTL, author = "Dave Dice and Alex Kogan and Yossi Lev", title = "Refined transactional lock elision", journal = j-SIGPLAN, volume = "51", number = "8", pages = "19:1--19:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851162", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactional lock elision (TLE) is a well-known technique that exploits hardware transactional memory (HTM) to introduce concurrency into lock-based software. It achieves that by attempting to execute a critical section protected by a lock in an atomic hardware transaction, reverting to the lock if these attempts fail. One significant drawback of TLE is that it disables hardware speculation once there is a thread running under lock. In this paper we present two algorithms that rely on existing compiler support for transactional programs and allow threads to speculate concurrently on HTM along with a thread holding the lock. We demonstrate the benefit of our algorithms over TLE and other related approaches with an in-depth analysis of a number of benchmarks and a wide range of workloads, including an AVL tree-based micro-benchmark and ccTSA, a real sequence assembler application.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Cao:2016:DBG, author = "Man Cao and Minjia Zhang and Aritra Sengupta and Michael D. Bond", title = "Drinking from both glasses: combining pessimistic and optimistic tracking of cross-thread dependences", journal = j-SIGPLAN, volume = "51", number = "8", pages = "20:1--20:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851143", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is notoriously challenging to develop parallel software systems that are both scalable and correct. Runtime support for parallelism---such as multithreaded record {\&} replay, data race detectors, transactional memory, and enforcement of stronger memory models---helps achieve these goals, but existing commodity solutions slow programs substantially in order to track (i.e., detect or control) an execution's cross-thread dependences accurately. Prior work tracks cross-thread dependences either ``pessimistically,'' slowing every program access, or ``optimistically,'' allowing for lightweight instrumentation of most accesses but dramatically slowing accesses involved in cross-thread dependences. This paper seeks to hybridize pessimistic and optimistic tracking, which is challenging because there exists a fundamental mismatch between pessimistic and optimistic tracking. We address this challenge based on insights about how dependence tracking and program synchronization interact, and introduce a novel approach called hybrid tracking. 
Hybrid tracking is suitable for building efficient runtime support, which we demonstrate by building hybrid-tracking-based versions of a dependence recorder and a region serializability enforcer. An adaptive, profile-based policy makes runtime decisions about switching between pessimistic and optimistic tracking. Our evaluation shows that hybrid tracking enables runtime support to overcome the performance limitations of both pessimistic and optimistic tracking alone.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Wang:2016:MGM, author = "Tianzheng Wang and Milind Chabbi and Hideaki Kimura", title = "Be my guest: {MCS} lock now welcomes guests", journal = j-SIGPLAN, volume = "51", number = "8", pages = "21:1--21:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851160", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The MCS lock is one of the most prevalent queuing locks. It provides fair scheduling and high performance on massively parallel systems. However, the MCS lock mandates a bring-your-own-context policy: each lock user must provide an additional context (i.e., a queue node) to interact with the lock. This paper proposes MCSg, a variant of the MCS lock that relaxes this restriction. Our key observation is that not all lock users are created equal. We analyzed how locks are used in massively-parallel modern systems, such as NUMA-aware operating systems and databases. We found that such systems often have a small number of ``regular'' code paths that enter the lock very frequently. Such code paths are the primary beneficiary of the high scalability of MCS locks. However, there are also many ``guest'' code paths that infrequently enter the lock and do not need the same degree of fairness to access the lock (e.g., background tasks that only run periodically with lower priority). These guest users, which are typically spread out in various modules of the software, prefer context-free locks, such as ticket locks. MCSg provides these guests a context-free interface while regular users still enjoy the benefits provided by MCS. It can also be used as a drop-in replacement of MCS for more advanced locks, such as cohort locking. We also propose MCSg++, an extended version of MCSg, which avoids guest starvation and non-FIFO behaviors that might happen with MCSg. 
Our evaluation using microbenchmarks and the TPC-C database benchmark on a 16-socket, 240-core server shows that both MCSg and MCSg++ preserve the benefits of MCS for regular users while providing a context-free interface for guests.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Chabbi:2016:CCL, author = "Milind Chabbi and John Mellor-Crummey", title = "Contention-conscious, locality-preserving locks", journal = j-SIGPLAN, volume = "51", number = "8", pages = "22:1--22:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851166", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the last decade, the growing use of cache-coherent NUMA architectures has spurred the development of numerous locality-preserving mutual exclusion algorithms. NUMA-aware locks such as HCLH, HMCS, and cohort locks exploit locality of reference among nearby threads to deliver high lock throughput under high contention. However, the hierarchical nature of these locality-aware locks increases latency, which reduces the throughput of uncontended or lightly-contended critical sections. To date, no lock design for NUMA systems has delivered both low latency under low contention and high throughput under high contention. In this paper, we describe the design and evaluation of an adaptive mutual exclusion scheme (AHMCS lock), which employs several orthogonal strategies---a hierarchical MCS (HMCS) lock for high throughput under high contention, Lamport's fast path approach for low latency under low contention, an adaptation mechanism that employs hysteresis to balance latency and throughput under moderate contention, and hardware transactional memory for lowest latency in the absence of contention. The result is a top performing lock that has most properties of an ideal mutual exclusion algorithm. AHMCS exploits the strengths of multiple contention management techniques to deliver high performance over a broad range of contention levels. Our empirical evaluations demonstrate the effectiveness of AHMCS over prior art.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Kalikar:2016:DNM, author = "Saurabh Kalikar and Rupesh Nasre", title = "{DomLock}: a new multi-granularity locking technique for hierarchies", journal = j-SIGPLAN, volume = "51", number = "8", pages = "23:1--23:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851164", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present efficient locking mechanisms for hierarchical data structures. Several applications work on an abstract hierarchy of objects, and a parallel execution on this hierarchy necessitates synchronization across workers operating on different parts of the hierarchy. Existing synchronization mechanisms are either too coarse, too inefficient, or too ad hoc, resulting in reduced or unpredictable amount of concurrency. 
We propose a new locking approach based on the structural properties of the underlying hierarchy. We show that the developed techniques are efficient even when the hierarchy is an arbitrary graph, and are applicable even when the hierarchy involves mutation. Theoretically, we present our approach as a locking-cost-minimizing instance of a generic algebraic model of synchronization for hierarchical data structures. Using STMBench7, we illustrate considerable reduction in the locking cost, resulting in an average throughput improvement of 42\%.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Ritson:2016:BWM, author = "Carl G. Ritson and Scott Owens", title = "Benchmarking weak memory models", journal = j-SIGPLAN, volume = "51", number = "8", pages = "24:1--24:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851150", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To achieve good multi-core performance, modern microprocessors have weak memory models, rather than enforce sequential consistency. This gives the programmer a wide scope for choosing exactly how to implement various aspects of inter-thread communication through the system's shared memory. However, these choices come with both semantic and performance consequences, often in tension with each other. In this paper, we focus on the performance side, and define techniques for evaluating the impact of various choices in using weak memory models, such as where to put fences, and which fences to use. We make no attempt to judge certain strategies as best or most efficient, and instead provide the techniques that will allow the programmer to understand the performance implications when identifying and resolving any semantic/performance trade-offs. In particular, our technique supports the reasoned selection of macrobenchmarks to use in investigating trade-offs in using weak memory models. We demonstrate our technique on both synthetic benchmarks and real-world applications for the Linux Kernel and OpenJDK Hotspot Virtual Machine on the ARMv8 and POWERv7 architectures.", acknowledgement = ack-nhfb, articleno = "24", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Narayanaswamy:2016:VCA, author = "Ganesh Narayanaswamy and Saurabh Joshi and Daniel Kroening", title = "The virtues of conflict: analysing modern concurrency", journal = j-SIGPLAN, volume = "51", number = "8", pages = "25:1--25:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851165", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern shared memory multiprocessors permit reordering of memory operations for performance reasons. These reorderings are often a source of subtle bugs in programs written for such architectures. Traditional approaches to verify weak memory programs often rely on interleaving semantics, which is prone to state space explosion, and thus severely limits the scalability of the analysis. 
In recent times, there has been a renewed interest in modelling dynamic executions of weak memory programs using partial orders. However, such an approach typically requires ad-hoc mechanisms to correctly capture the data and control-flow choices/conflicts present in real-world programs. In this work, we propose a novel, conflict-aware, composable, truly concurrent semantics for programs written using C/C++ for modern weak memory architectures. We exploit our symbolic semantics based on general event structures to build an efficient decision procedure that detects assertion violations in bounded multi-threaded programs. Using a large, representative set of benchmarks, we show that our conflict-aware semantics outperforms the state-of-the-art partial-order based approaches.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Perrin:2016:CCB, author = "Matthieu Perrin and Achour Mostefaoui and Claude Jard", title = "Causal consistency: beyond memory", journal = j-SIGPLAN, volume = "51", number = "8", pages = "26:1--26:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851170", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In distributed systems where strong consistency is costly when not impossible, causal consistency provides a valuable abstraction to represent program executions as partial orders. In addition to the sequential program order of each computing entity, causal order also contains the semantic links between the events that affect the shared objects --- messages emission and reception in a communication channel, reads and writes on a shared register. Usual approaches based on semantic links are very difficult to adapt to other data types such as queues or counters because they require a specific analysis of causal dependencies for each data type. This paper presents a new approach to define causal consistency for any abstract data type based on sequential specifications. 
It explores, formalizes and studies the differences between three variations of causal consistency and highlights them in the light of PRAM, eventual consistency and sequential consistency: weak causal consistency, that captures the notion of causality preservation when focusing on convergence; causal convergence that mixes weak causal consistency and convergence; and causal consistency, that coincides with causal memory when applied to shared memory.", acknowledgement = ack-nhfb, articleno = "26", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Chatzopoulos:2016:EES, author = "Georgios Chatzopoulos and Aleksandar Dragojevi{\'c} and Rachid Guerraoui", title = "{ESTIMA}: extrapolating scalability of in-memory applications", journal = j-SIGPLAN, volume = "51", number = "8", pages = "27:1--27:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851159", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents ESTIMA, an easy-to-use tool for extrapolating the scalability of in-memory applications. ESTIMA is designed to perform a simple, yet important task: given the performance of an application on a small machine with a handful of cores, ESTIMA extrapolates its scalability to a larger machine with more cores, while requiring minimum input from the user. The key idea underlying ESTIMA is the use of stalled cycles (e.g. cycles that the processor spends waiting for various events, such as cache misses or waiting on a lock). ESTIMA measures stalled cycles on a few cores and extrapolates them to more cores, estimating the amount of waiting in the system. ESTIMA can be effectively used to predict the scalability of in-memory applications. For instance, using measurements of memcached and SQLite on a desktop machine, we obtain accurate predictions of their scalability on a server. Our extensive evaluation on a large number of in-memory benchmarks shows that ESTIMA has generally low prediction errors.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Muddukrishna:2016:GGO, author = "Ananya Muddukrishna and Peter A. Jonsson and Artur Podobas and Mats Brorsson", title = "Grain graphs: {OpenMP} performance analysis made easy", journal = j-SIGPLAN, volume = "51", number = "8", pages = "28:1--28:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851156", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Average programmers struggle to solve performance problems in OpenMP programs with tasks and parallel for-loops. Existing performance analysis tools visualize OpenMP task performance from the runtime system's perspective where task execution is interleaved with other tasks in an unpredictable order. Problems with OpenMP parallel for-loops are similarly difficult to resolve since tools only visualize aggregate thread-level statistics such as load imbalance without zooming into a per-chunk granularity. 
The runtime system/threads oriented visualization provides poor support for understanding problems with task and chunk execution time, parallelism, and memory hierarchy utilization, forcing average programmers to rely on experts or use tedious trial-and-error tuning methods for performance. We present grain graphs, a new OpenMP performance analysis method that visualizes grains --- computation performed by a task or a parallel for-loop chunk instance --- and highlights problems such as low parallelism, work inflation and poor parallelization benefit at the grain level. We demonstrate that grain graphs can quickly reveal performance problems that are difficult to detect and characterize in fine detail using existing visualizations in standard OpenMP programs, simplifying OpenMP performance analysis. This enables average programmers to make portable optimizations for poor performing OpenMP programs, reducing pressure on experts and removing the need for tedious trial-and-error tuning.", acknowledgement = ack-nhfb, articleno = "28", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Machado:2016:PGC, author = "Nuno Machado and Brandon Lucia and Lu{\'\i}s Rodrigues", title = "Production-guided concurrency debugging", journal = j-SIGPLAN, volume = "51", number = "8", pages = "29:1--29:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851149", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrency bugs that stem from schedule-dependent branches are hard to understand and debug, because their root causes imply not only different event orderings, but also changes in the control-flow between failing and non-failing executions. We present Cortex: a system that helps exposing and understanding concurrency bugs that result from schedule-dependent branches, without relying on information from failing executions. Cortex preemptively exposes failing executions by perturbing the order of events and control-flow behavior in non-failing schedules from production runs of a program. By leveraging this information from production runs, Cortex synthesizes executions to guide the search for failing schedules. Production-guided search helps cope with the large execution search space by targeting failing executions that are similar to observed non-failing executions. Evaluation on popular benchmarks shows that Cortex is able to expose failing schedules with only a few perturbations to non-failing executions, and takes a practical amount of time.", acknowledgement = ack-nhfb, articleno = "29", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Farooqui:2016:AAW, author = "Naila Farooqui and Rajkishore Barik and Brian T. 
Lewis and Tatiana Shpeisman and Karsten Schwan", title = "Affinity-aware work-stealing for integrated {CPU--GPU} processors", journal = j-SIGPLAN, volume = "51", number = "8", pages = "30:1--30:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851194", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent integrated CPU-GPU processors like Intel's Broadwell and AMD's Kaveri support hardware CPU-GPU shared virtual memory, atomic operations, and memory coherency. This enables fine-grained CPU-GPU work-stealing, but architectural differences between the CPU and GPU hurt the performance of traditionally-implemented work-stealing on such processors. These architectural differences include different clock frequencies, atomic operation costs, and cache and shared memory latencies. This paper describes a preliminary implementation of our work-stealing scheduler, Libra, which includes techniques to deal with these architectural differences in integrated CPU-GPU processors. Libra's affinity-aware techniques achieve significant performance gains over classically-implemented work-stealing. We show preliminary results using a diverse set of nine regular and irregular workloads running on an Intel Broadwell Core-M processor. Libra currently achieves up to a 2$ \times $ performance improvement over classical work-stealing, with a 20\% average improvement.", acknowledgement = ack-nhfb, articleno = "30", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Gindraud:2016:ICM, author = "Fran{\c{c}}ois Gindraud and Fabrice Rastello and Albert Cohen and Fran{\c{c}}ois Broquedis", title = "An interval constrained memory allocator for the {Givy} {GAS} runtime", journal = j-SIGPLAN, volume = "51", number = "8", pages = "31:1--31:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851195", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The shared memory model helps parallel programming productivity, but it also has a high hardware cost and imposes scalability constraints. Ultimately, higher performance will use distributed memories, which scales better but requires programmers to manually transfer data between local memories, which is a complex task. 
Distributed memories are also more energy efficient than shared memories, and are used in a family of embedded computing solutions called multi processor system on chip (MPSoC).", acknowledgement = ack-nhfb, articleno = "31", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Chang:2016:PSF, author = "Li-Wen Chang and Izzat {El Hajj} and Hee-Seok Kim and Juan G{\'o}mez-Luna and Abdul Dakkak and Wen-mei Hwu", title = "A programming system for future proofing performance critical libraries", journal = j-SIGPLAN, volume = "51", number = "8", pages = "32:1--32:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851178", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Tangram, a programming system for writing performance-portable programs. The language enables programmers to write computation and composition codelets, supported by tuning knobs and primitives for expressing data parallelism and work decomposition. The compiler and runtime use a set of techniques such as hierarchical composition, coarsening, data placement, tuning, and runtime selection based on input characteristics and micro-profiling. The resulting performance is competitive with optimized vendor libraries.", acknowledgement = ack-nhfb, articleno = "32", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Nielsen:2016:SLF, author = "Jesper Puge Nielsen and Sven Karlsson", title = "A scalable lock-free hash table with open addressing", journal = j-SIGPLAN, volume = "51", number = "8", pages = "33:1--33:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851196", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent data structures synchronized with locks do not scale well with the number of threads. As more scalable alternatives, concurrent data structures and algorithms based on widely available, however advanced, atomic operations have been proposed. These data structures allow for correct and concurrent operations without any locks. In this paper, we present a new fully lock-free open addressed hash table with a simpler design than prior published work. We split hash table insertions into two atomic phases: first inserting a value ignoring other concurrent operations, then in the second phase resolve any duplicate or conflicting values. Our hash table has a constant and low memory usage that is less than existing lock-free hash tables at a fill level of 33\% and above. The hash table exhibits good cache locality. Compared to prior art, our hash table results in 16\% and 15\% fewer L1 and L2 cache misses respectively, leading to 21\% fewer memory stall cycles. 
Our experiments show that our hash table scales close to linearly with the number of threads and outperforms, in throughput, other lock-free hash tables by 19\%.", acknowledgement = ack-nhfb, articleno = "33", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Maier:2016:CHT, author = "Tobias Maier and Peter Sanders and Roman Dementiev", title = "Concurrent hash tables: fast and general?(!)", journal = j-SIGPLAN, volume = "51", number = "8", pages = "34:1--34:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851188", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent hash tables are one of the most important concurrent data structures with numerous applications. Since hash table accesses can dominate the execution time of the overall application, we need implementations that achieve good speedup. Unfortunately, currently available concurrent hashing libraries turn out to be far away from this requirement in particular when contention on some elements occurs. Our starting point for better performing data structures is a fast and simple lock-free concurrent hash table based on linear probing that is limited to word-sized key-value types and does not support dynamic size adaptation. We explain how to lift these limitations in a provably scalable way and demonstrate that dynamic growing has a performance overhead comparable to the same generalization in sequential hash tables. We perform extensive experiments comparing the performance of our implementations with six of the most widely used concurrent hash tables. Ours are considerably faster than the best algorithms with similar restrictions and an order of magnitude faster than the best more general tables. In some extreme cases, the difference even approaches four orders of magnitude.", acknowledgement = ack-nhfb, articleno = "34", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Prades:2016:CAX, author = "Javier Prades and Carlos Rea{\~n}o and Federico Silla", title = "{CUDA} acceleration for {Xen} virtual machines in {InfiniBand} clusters with {rCUDA}", journal = j-SIGPLAN, volume = "51", number = "8", pages = "35:1--35:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851181", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many data centers currently use virtual machines (VMs) to achieve a more efficient usage of hardware resources. However, current virtualization solutions, such as Xen, do not easily provide graphics processing unit (GPU) accelerators to applications running in the virtualized domain with the flexibility usually required in data centers (i.e., managing virtual GPU instances and concurrently sharing them among several VMs). Remote GPU virtualization frameworks such as the rCUDA solution may address this problem. In this work we analyze the use of the rCUDA framework to accelerate scientific applications running inside Xen VMs. 
Results show that the use of the rCUDA framework is a feasible approach, featuring a very low overhead if an InfiniBand fabric is already present in the cluster.", acknowledgement = ack-nhfb, articleno = "35", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Umar:2016:EPF, author = "Ibrahim Umar and Otto J. Anshus and Phuong H. Ha", title = "Effect of portable fine-grained locality on energy efficiency and performance in concurrent search trees", journal = j-SIGPLAN, volume = "51", number = "8", pages = "36:1--36:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851186", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent research has suggested that improving fine-grained data-locality is one of the main approaches to improving energy efficiency and performance. However, no previous research has investigated the effect of the approach on these metrics in the case of concurrent data structures. This paper investigates how fine-grained data locality influences energy efficiency and performance in concurrent search trees, a crucial data structure that is widely used in several important systems. We conduct a set of experiments on three lock-based concurrent search trees: DeltaTree, a portable fine-grained locality-aware concurrent search tree; CBTree, a coarse-grained locality-aware B+tree; and BST-TK, a locality-oblivious concurrent search tree. We run the experiments on a commodity x86 platform and an embedded ARM platform. The experimental results show that DeltaTree has 13--25\% better energy efficiency and 10--22\% more operations/second on the x86 and ARM platforms, respectively. The results confirm that portable fine-grained locality can improve energy efficiency and performance in concurrent search trees.", acknowledgement = ack-nhfb, articleno = "36", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Parikh:2016:EDW, author = "Hrushit Parikh and Vinit Deodhar and Ada Gavrilovska and Santosh Pande", title = "Efficient distributed workstealing via matchmaking", journal = j-SIGPLAN, volume = "51", number = "8", pages = "37:1--37:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851175", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many classes of high-performance applications and combinatorial problems exhibit large degree of runtime load variability. One approach to achieving balanced resource use is to over-decompose the problem into fine-grained tasks that are then dynamically balanced using approaches such as workstealing. Existing work stealing techniques for such irregular applications, running on large clusters, exhibit high overheads due to potential untimely interruption of busy nodes, excessive communication messages and delays experienced by idle nodes in finding work due to repeated failed steals. We contend that the fundamental problem of distributed work-stealing is of rapidly bringing together work producers and consumers.
In response, we develop an algorithm that performs timely, lightweight and highly efficient matchmaking between work producers and consumers which results in accurate load balance. Experimental evaluations show that our scheduler is able to outperform other distributed work stealing schedulers, and to achieve scale beyond what is possible with current approaches.", acknowledgement = ack-nhfb, articleno = "37", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Luo:2016:DCC, author = "Hao Luo and Guoyang Chen and Pengcheng Li and Chen Ding and Xipeng Shen", title = "Data-centric combinatorial optimization of parallel code", journal = j-SIGPLAN, volume = "51", number = "8", pages = "38:1--38:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851182", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Memory performance is one essential factor for tapping into the full potential of the massive parallelism of GPU. It has motivated some recent efforts in GPU cache modeling. This paper presents a new data-centric way to model the performance of a system with heterogeneous memory resources. The new model is composable, meaning it can predict the performance difference due to placing data differently by profiling the execution just once.", acknowledgement = ack-nhfb, articleno = "38", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Maleki:2016:DSD, author = "Saeed Maleki and Donald Nguyen and Andrew Lenharth and Mar{\'\i}a Garzar{\'a}n and David Padua and Keshav Pingali", title = "{DSMR}: a shared and distributed memory algorithm for single-source shortest path problem", journal = j-SIGPLAN, volume = "51", number = "8", pages = "39:1--39:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851183", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The Single-Source Shortest Path (SSSP) problem is to find the shortest paths from a source vertex to all other vertices in a graph. In this paper, we introduce the Dijkstra Strip-Mined Relaxation (DSMR) algorithm, an efficient parallel SSSP algorithm for shared and distributed memory systems. 
Our results show that DSMR is faster than parallel $ \Delta $-Stepping by a factor of up to 1.66.", acknowledgement = ack-nhfb, articleno = "39", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Salucci:2016:GMC, author = "Luca Salucci and Daniele Bonetta and Stefan Marr and Walter Binder", title = "Generic messages: capability-based shared memory parallelism for event-loop systems", journal = j-SIGPLAN, volume = "51", number = "8", pages = "40:1--40:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851184", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Systems based on event-loops have been popularized by Node.JS, and are becoming a key technology in the domain of cloud computing. Despite their popularity, such systems support only share-nothing parallelism via message passing between parallel entities usually called workers. In this paper, we introduce a novel parallel programming abstraction called Generic Messages (GEMs), which enables shared-memory parallelism for share-nothing event-based systems. A key characteristic of GEMs is that they enable workers to share state by specifying how the state can be accessed once it is shared. We call this aspect of the GEMs model capability-based parallelism.", acknowledgement = ack-nhfb, articleno = "40", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Liu:2016:HCG, author = "Jianqiao Liu and Nikhil Hegde and Milind Kulkarni", title = "Hybrid {CPU--GPU} scheduling and execution of tree traversals", journal = j-SIGPLAN, volume = "51", number = "8", pages = "41:1--41:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851174", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "GPUs offer the promise of massive, power-efficient parallelism. However, exploiting this parallelism requires code to be carefully structured to deal with the limitations of the SIMT execution model. In recent years, there has been much interest in mapping irregular applications to GPUs: applications with unpredictable, data-dependent behaviors. While most of the work in this space has focused on ad hoc implementations of specific algorithms, recent work has looked at generic techniques for mapping a large class of tree traversal algorithms to GPUs, through careful restructuring of the tree traversal algorithms to make them behave more regularly. Unfortunately, even this general approach for GPU execution of tree traversal algorithms is reliant on ad hoc, handwritten, algorithm-specific scheduling (i.e., assignment of threads to warps) to achieve high performance. The key challenge of scheduling is that it is a highly irregular process, that requires the inspection of thread behavior and then careful sorting of the threads into warps. In this paper, we present a novel scheduling and execution technique for tree traversal algorithms that is both general and automatic.
The key novelty is a hybrid approach: the GPU partially executes tasks to inspect thread behavior and transmits information back to the CPU, which uses that information to perform the scheduling itself, before executing the remaining, carefully scheduled, portion of the traversals on the GPU. We applied this framework to five tree traversal algorithms, achieving significant speedups over optimized GPU code that does not perform application-specific scheduling. Further, we show that in many cases, our hybrid approach is able to deliver better performance even than GPU code that uses hand-tuned, application-specific scheduling.", acknowledgement = ack-nhfb, articleno = "41", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Ramachandran:2016:IEI, author = "Arunmoezhi Ramachandran and Neeraj Mittal", title = "Improving efficacy of internal binary search trees using local recovery", journal = j-SIGPLAN, volume = "51", number = "8", pages = "42:1--42:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851173", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Binary Search Tree (BST) is an important data structure for managing ordered data. Many algorithms---blocking as well as non-blocking---have been proposed for concurrent manipulation of a binary search tree in an asynchronous shared memory system that supports search, insert and delete operations based on both external and internal representations of a search tree. An important step in executing an operation on a tree is to traverse the tree from top-to-down in order to locate the operation's window. A process may need to perform this traversal several times to handle any failures occurring due to other processes performing conflicting actions on the tree. Most concurrent algorithms that have been proposed so far use a na{\"\i}ve approach and simply restart the traversal from the root of the tree. In this work, we present a new approach to recover from such failures more efficiently in a concurrent binary search tree based on internal representation using local recovery by restarting the traversal from the ``middle'' of the tree in order to locate an operation's window. Our approach is sufficiently general in the sense that it can be applied to a variety of concurrent binary search trees based on both blocking and non-blocking approaches.
Using experimental evaluation, we demonstrate that our local recovery approach can yield significant speed-ups of up to 69\% for many concurrent algorithms.", acknowledgement = ack-nhfb, articleno = "42", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Merrill:2016:MBS, author = "Duane Merrill and Michael Garland", title = "Merge-based sparse matrix-vector multiplication {(SpMV)} using the {CSR} storage format", journal = j-SIGPLAN, volume = "51", number = "8", pages = "43:1--43:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851190", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a perfectly balanced, ``merge-based'' parallel method for computing sparse matrix-vector products (SpMV). Our algorithm operates directly upon the Compressed Sparse Row (CSR) sparse matrix format, a predominant in-memory representation for general-purpose sparse linear algebra computations. Our CsrMV performs an equitable multi-partitioning of the input dataset, ensuring that no single thread can be overwhelmed by assignment to (a) arbitrarily-long rows or (b) an arbitrarily-large number of zero-length rows. This parallel decomposition requires neither offline preprocessing nor specialized/ancillary data formats. We evaluate our method on both CPU and GPU microarchitecture across an enormous corpus of diverse real world matrix datasets. We show that traditional CsrMV methods are inconsistent performers subject to order-of-magnitude slowdowns, whereas the performance response of our method is substantially impervious to row-length heterogeneity.", acknowledgement = ack-nhfb, articleno = "43", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Drebes:2016:NAS, author = "Andi Drebes and Antoniu Pop and Karine Heydemann and Nathalie Drach and Albert Cohen", title = "{NUMA}-aware scheduling and memory allocation for data-flow task-parallel applications", journal = j-SIGPLAN, volume = "51", number = "8", pages = "44:1--44:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851193", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic task parallelism is a popular programming model on shared-memory systems. Compared to data parallel loop-based concurrency, it promises enhanced scalability, load balancing and locality. These promises, however, are undermined by non-uniform memory access (NUMA) systems. We show that it is possible to preserve the uniform hardware abstraction of contemporary task-parallel programming models, for both computing and memory resources, while achieving near-optimal data locality. Our run-time algorithms for NUMA-aware task and data placement are fully automatic, application-independent, performance-portable across NUMA machines, and adapt to dynamic changes. Placement decisions use information about inter-task data dependences and reuse. 
This information is readily available in the run-time systems of modern task-parallel programming frameworks, and from the operating system regarding the placement of previously allocated memory. Our algorithms take advantage of data-flow style task parallelism, where the privatization of task data enhances scalability through the elimination of false dependences and enables fine-grained dynamic control over the placement of application data. We demonstrate that the benefits of dynamically managing data placement outweigh the privatization cost, even when comparing with target-specific optimizations through static, NUMA-aware data interleaving. Our implementation and the experimental evaluation on a set of high-performance benchmarks executing on a 192-core system with 24 NUMA nodes show that the fraction of local memory accesses can be increased to more than 99\%, resulting in a speedup of up to 5$ \times $ compared to a NUMA-aware hierarchical work-stealing baseline.", acknowledgement = ack-nhfb, articleno = "44", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Mohamedin:2016:DNA, author = "Mohamed Mohamedin and Roberto Palmieri and Sebastiano Peluso and Binoy Ravindran", title = "On designing {NUMA}-aware concurrency control for scalable transactional memory", journal = j-SIGPLAN, volume = "51", number = "8", pages = "45:1--45:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851189", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "NUMA architectures posed the challenge of rethinking parallel applications due to the non-homogeneity introduced by their design, and their real benefits are limited to the characteristics of the particular workload. We name as partitionable transactional workloads such workloads that may be able to exploit the distributed nature of NUMA, such as transactional workloads where data and accesses can be easily partitioned among the so called NUMA zones. However, in case those workloads require the synchronization on shared data, we have to face the issue of exploiting the NUMA architecture also in the concurrency control for their transactions. Therefore in this paper we present a NUMA-aware concurrency control for transactional memory that we designed for promoting scalability in scenarios where both the transactional workload is prone to scale, and the characteristics of the underlying memory model are inherently non-uniform, such as NUMA architectures.", acknowledgement = ack-nhfb, articleno = "45", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Saad:2016:OTC, author = "Mohamed M. 
Saad and Roberto Palmieri and Binoy Ravindran", title = "On ordering transaction commit", journal = j-SIGPLAN, volume = "51", number = "8", pages = "46:1--46:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851191", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this poster paper, we briefly introduce an effective solution to address the problem of committing transactions enforcing a predefined order. To do that, we overview the design of two algorithms that deploy a cooperative transaction execution that circumvents the transaction isolation constraint in favor of propagating written values among conflicting transactions. A preliminary implementation shows that even in the presence of data conflicts, the proposed algorithms outperform other competitors, significantly.", acknowledgement = ack-nhfb, articleno = "46", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Qian:2016:ODG, author = "Xuehai Qian and Koushik Sen and Paul Hargrove and Costin Iancu", title = "{OPR}: deterministic group replay for one-sided communication", journal = j-SIGPLAN, volume = "51", number = "8", pages = "47:1--47:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851179", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The ability to reproduce a parallel execution is desirable for debugging and program reliability purposes. In debugging (13), the programmer needs to manually step back in time, while for resilience (6) this is automatically performed by the application upon failure. To be useful, replay has to faithfully reproduce the original execution. For parallel programs the main challenge is inferring and maintaining the order of conflicting operations (data races). Deterministic record and replay (R{\&}R) techniques have been developed for multithreaded shared memory programs (5), as well as distributed memory programs (14). Our main interest is techniques for large scale scientific (3; 4) programming models.", acknowledgement = ack-nhfb, articleno = "47", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Rabozzi:2016:PAP, author = "Marco Rabozzi and Matteo Mazzucchelli and Roberto Cordone and Giovanni Matteo Fumarola and Marco D. Santambrogio", title = "Preemption-aware planning on big-data systems", journal = j-SIGPLAN, volume = "51", number = "8", pages = "48:1--48:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851187", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent developments in Big Data frameworks are moving towards reservation based approaches as a means to manage the increasingly complex mix of computations, whereas preemption techniques are employed to meet strict job deadlines. Within this work we propose and evaluate a new planning algorithm in the context of reservation based scheduling.
Our approach is able to achieve high cluster utilization while minimizing the need for preemption that causes system overheads and planning mispredictions.", acknowledgement = ack-nhfb, articleno = "48", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Chen:2016:SPN, author = "Yifeng Chen and Kun Huang and Bei Wang and Guohui Li and Xiang Cui", title = "{Samsara Parallel}: a non-{BSP} parallel-in-time model", journal = j-SIGPLAN, volume = "51", number = "8", pages = "49:1--49:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851185", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many time-dependent problems like molecular dynamics of protein folding require a large number of time steps. The latencies and overheads of common-purpose clusters with accelerators are too big for high-frequency iteration. We introduce an algorithmic model called Samsara Parallel (or SP) which, unlike BSP, relies on asynchronous communications and can repeatedly return to earlier time steps to refine the precision of computation. This also extends a line of research called Parallel-in-Time in computational chemistry and physics.", acknowledgement = ack-nhfb, articleno = "49", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Zhang:2016:SAN, author = "Mingzhe Zhang and Francis C. M. Lau and Cho-Li Wang and Luwei Cheng and Haibo Chen", title = "Scalable adaptive {NUMA}-aware lock: combining local locking and remote locking for efficient concurrency", journal = j-SIGPLAN, volume = "51", number = "8", pages = "50:1--50:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851176", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scalable locking is a key building block for scalable multi-threaded software. Its performance is especially critical in multi-socket, multi-core machines with non-uniform memory access (NUMA). Previous schemes such as local locking and remote locking only perform well under a certain level of contention, and often require non-trivial tuning for a particular configuration. Besides, for large NUMA systems, because of unmanaged lock server's nomination, current distance-first NUMA policies cannot perform satisfactorily. In this work, we propose SANL, a locking scheme that can deliver high performance under various contention levels by adaptively switching between the local and the remote lock scheme. Furthermore, we introduce a new NUMA policy for the remote lock that jointly considers node distances and server utilization when choosing lock servers. A comparison with seven representative locking schemes shows that SANL outperforms the others in most contention situations. 
In one group test, SANL is 3.7 times faster than RCL lock and 17 times faster than POSIX mutex.", acknowledgement = ack-nhfb, articleno = "50", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Hegde:2016:SRS, author = "Nikhil Hegde and Jianqiao Liu and Milind Kulkarni", title = "{SPIRIT}: a runtime system for distributed irregular tree applications", journal = j-SIGPLAN, volume = "51", number = "8", pages = "51:1--51:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851177", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Repeated, depth-first traversal of trees is a common algorithmic pattern in an important set of applications from diverse domains such as cosmological simulations, data mining, and computer graphics. As these applications operate over massive data sets, it is often necessary to distribute the trees to process all of the data. In this work, we introduce SPIRIT, a runtime system to ease the writing of distributed tree applications. SPIRIT automates the challenging tasks of tree distribution, optimizing communication and parallelizing independent computations. The common algorithmic pattern in tree traversals is exploited to effectively schedule parallel computations and improve locality. As a result, pipeline parallelism in distributed traversals is identified, which is complemented by load-balancing, and locality-enhancing, message aggregation optimizations. Evaluation of SPIRIT on tree traversal in Point Correlation (PC) shows a scalable system, achieving speedups up to 38x on a 16-node, 64 process system compared to a 1-node, baseline configuration. We also find that SPIRIT results in substantially less communication and achieves significant performance improvements over implementations in other distributed graph systems.", acknowledgement = ack-nhfb, articleno = "51", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Ramalhete:2016:TME, author = "Pedro Ramalhete and Andreia Correia", title = "{Tidex}: a mutual exclusion lock", journal = j-SIGPLAN, volume = "51", number = "8", pages = "52:1--52:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851171", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Several basic mutual exclusion lock algorithms are known, with one of the simplest being the Ticket Lock. We present a new mutual exclusion lock with properties similar to the Ticket Lock but using atomic_exchange() instead of atomic_fetch_add() that can be more efficient on systems without a native instruction for atomic_fetch_add(), or in which the native instruction for atomic_exchange() is faster than the one for atomic_fetch_add().
Similarly to the Ticket Lock, our lock has a small memory footprint, is extremely simple, respects FIFO order, and provides starvation freedom in architectures that implement atomic_exchange() as a single instruction, like x86.", acknowledgement = ack-nhfb, articleno = "52", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Mastoras:2016:UFC, author = "Aristeidis Mastoras and Thomas R. Gross", title = "Unifying fixed code and fixed data mapping of load-imbalanced pipelined loops", journal = j-SIGPLAN, volume = "51", number = "8", pages = "53:1--53:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851172", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Some loops with cross-iteration dependences can execute in parallel by pipelining. The loop body is partitioned into stages such that the data dependences are not violated and then the stages are mapped onto threads. Two well-known mapping techniques are fixed code and fixed data; they achieve high performance for load-balanced loops, but they fail to perform well for load-imbalanced loops. In this article, we present a novel hybrid mapping that eliminates drawbacks of both prior mapping techniques and enables dynamic scheduling of stages.", acknowledgement = ack-nhfb, articleno = "53", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Kurt:2016:UAS, author = "Mehmet Can Kurt and Bin Ren and Sriram Krishnamoorthy and Gagan Agrawal", title = "User-assisted storage reuse determination for dynamic task graphs", journal = j-SIGPLAN, volume = "51", number = "8", pages = "54:1--54:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851180", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Models based on task graphs that operate on single-assignment data are attractive in several ways, but also require nuanced algorithms for scheduling and memory management for efficient execution.
In this paper, we consider memory-efficient dynamic scheduling of task graphs, and present a novel approach for dynamically recycling the memory locations assigned to data items as they are produced by tasks.", acknowledgement = ack-nhfb, articleno = "54", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Rehman:2016:VMJ, author = "Waqas Ur Rehman and Muhammad Sohaib Ayub and Junaid Haroon Siddiqui", title = "Verification of {MPI} {Java} programs using software model checking", journal = j-SIGPLAN, volume = "51", number = "8", pages = "55:1--55:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851192", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Development of concurrent software requires the programmer to be aware of non-determinism, data races, and deadlocks. MPI (message passing interface) is a popular standard for writing message-oriented distributed applications. Some messages in MPI systems can be processed by one of the many machines and in many possible orders. This non-determinism can affect the result of an MPI application. The alternate results may or may not be correct. To verify MPI applications, we need to check all these possible orderings and use an application-specific oracle to decide if these orderings give correct output. MPJ Express is an open-source Java implementation of the MPI standard. We developed a Java-based model of MPJ Express, where processes are modeled as threads, and which can run unmodified MPI Java programs on a single system. This enabled us to adapt the Java PathFinder explicit-state software model checker (JPF) using a custom listener to verify our model running real MPI Java programs. We evaluated our approach using small examples where model checking revealed message orders that would result in incorrect system behavior.", acknowledgement = ack-nhfb, articleno = "55", fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '16 conference proceedings.", } @Article{Sarkar:2016:VEC, author = "Vivek Sarkar", title = "Virtualizing the Edge of the Cloud: the New Frontier", journal = j-SIGPLAN, volume = "51", number = "7", pages = "1--1", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892243", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the last two decades, virtualization technologies have turned datacenter infrastructure into a multitenant, dynamically provisionable, elastic resource, and formed the basis for the wide adoption of cloud computing. Many of today's cloud applications, however, are based on continuous interactions with end users and their devices, and the trend is only expected to intensify with the expansion of the Internet of Things. The consequent bandwidth and latency requirements of these emerging workloads push the cloud boundary outside of traditional datacenters, giving rise to an edge tier in the end-device-to-cloud-backend infrastructure.
Computational resources embedded in anything from standalone microservers to WiFi routers and small cell access points, and their open APIs, present opportunities for deploying application logic and state closer to where it is being used, addressing both latency and backhaul bandwidth problems. This talk will look at the role that existing virtualization technologies can play in providing in this edge tier the required flexibility, dynamic provisioning and isolation, and will outline open problems that require development of new solutions. We will also discuss the opportunities to leverage these technologies to further deal with the diversity in the end-user device and IoT space.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Huang:2016:BKB, author = "Yu-Ju Huang and Hsuan-Heng Wu and Yeh-Ching Chung and Wei-Chung Hsu", title = "Building a {KVM}-based Hypervisor for a Heterogeneous System Architecture Compliant System", journal = j-SIGPLAN, volume = "51", number = "7", pages = "3--15", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892246", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Heterogeneous System Architecture (HSA) is an architecture developed by the HSA foundation aiming at reducing programmability barriers as well as improving communication efficiency for heterogeneous computing. For example, HSA allows heterogeneous computing devices to share the same virtual address space. This feature allows programmers to bypass explicit data copying between devices, as was required in the past. HSA features such as job dispatching through user level queues and memory based signaling help to reduce communication latency between the host and other computing devices. While the new features in HSA enable more efficient heterogeneous computing, they also introduce new challenges to system virtualization, especially in memory virtualization and I/O virtualization. This work investigates the issues involved in HSA virtualization and implements a KVM-based hypervisor that supports the main features of HSA inside guest operating systems. Furthermore, this work shows that with the newly introduced hypervisor for HSA, system resources in HSA-compliant AMD Kaveri can be effectively shared between multiple guest operating systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Ouyang:2016:SUV, author = "Jiannan Ouyang and John R. 
Lange and Haoqiang Zheng", title = "{Shoot4U}: Using {VMM} Assists to Optimize {TLB} Operations on Preempted {vCPUs}", journal = j-SIGPLAN, volume = "51", number = "7", pages = "17--23", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892245", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Virtual Machine based approaches to workload consolidation, as seen in IaaS cloud as well as datacenter platforms, have long had to contend with performance degradation caused by synchronization primitives inside the guest environments. These primitives can be affected by virtual CPU preemptions by the host scheduler that can introduce delays that are orders of magnitude longer than those primitives were designed for. While a significant amount of work has focused on the behavior of spinlock primitives as a source of these performance issues, spinlocks do not represent the entirety of synchronization mechanisms that are susceptible to scheduling issues when running in a virtualized environment. In this paper we address the virtualized performance issues introduced by TLB shootdown operations. Our profiling study, based on the PARSEC benchmark suite, has shown that up to 64\% of a VM's CPU time can be spent on TLB shootdown operations under certain workloads. In order to address this problem, we present a paravirtual TLB shootdown scheme named Shoot4U. Shoot4U completely eliminates TLB shootdown preemptions by invalidating guest TLB entries from the VMM and allowing guest TLB shootdown operations to complete without waiting for remote virtual CPUs to be scheduled. Our performance evaluation using the PARSEC benchmark suite demonstrates that Shoot4U can reduce benchmark runtime by up to 85\% compared an unmodified Linux kernel, and up to 44\% over a state-of-the-art paravirtual TLB shootdown scheme.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Merrifield:2016:PIE, author = "Timothy Merrifield and H. Reza Taheri", title = "Performance Implications of Extended Page Tables on Virtualized x86 Processors", journal = j-SIGPLAN, volume = "51", number = "7", pages = "25--35", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892258", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Managing virtual memory is an expensive operation, and becomes even more expensive on virtualized servers. Processing TLB misses on a virtualized x86 server requires a two-dimensional page walk that can have 6x more page table lookups, hence 6x more memory references, than a native page table walk. Thus much of the recent research on the subject starts from the assumption that TLB miss processing in virtual environments is significantly more expensive than on native servers. However, we will show that with the latest software stack on modern x86 processors, most of these page-table lookups are satisfied by internal paging structure caches and the L1/L2 data caches, and the actual virtualization overhead of TLB miss processing is a modest fraction of the overall time spent processing TLB misses. 
In this paper, we present a detailed accounting of the TLB miss processing costs on virtualized x86 servers for an exhaustive set of workloads, in particular, two very demanding industry-standard workloads. We show that an implementation of the TPC-C workload that actively uses 475 GB of memory on a 72-CPU Haswell-EP server spends 20\% of its time processing TLB misses when the application runs in a VM. Although this is a non-trivial amount, it is only 4.2\% higher than the TLB miss processing costs on bare metal. The multi-VM VMmark benchmark spends 12.3\% of its time in TLB miss processing, but only 4.3\% of that can be attributed to virtualization overheads. We show that even for the heaviest workloads, a well-tuned application that uses large pages on a recent OS release with a modern hypervisor running on the latest x86 processors sees only minimal degradation from the additional overhead of the two-dimensional page walks in a virtualized server.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Nathan:2016:SRO, author = "Senthil Nathan and Umesh Bellur and Purushottam Kulkarni", title = "On Selecting the Right Optimizations for Virtual Machine Migration", journal = j-SIGPLAN, volume = "51", number = "7", pages = "37--49", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892247", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "To reduce the migration time of a virtual machine and the network traffic generated during migration, existing works have proposed a number of optimizations to pre-copy live migration. These optimizations are delta compression, page skip, deduplication, and data compression. The cost-benefit analysis of these optimizations may preclude the use of certain optimizations in specific scenarios. However, no study has compared the performance {\&} cost of these optimizations and identified the impact of application behaviour on performance gain. Hence, for a given migration scenario and application, it is not clear which optimization one should employ. In this paper, we present a comprehensive empirical study using a large number of workloads to provide recommendations on selection of optimizations for pre-copy live migration. The empirical study reveals that page skip is an important optimization as it reduces network traffic by 20\% with negligible additional CPU cost. Data compression yields impressive gains in reducing network traffic (37\%) but at the cost of a significant increase in CPU consumption (5$ \times $). De-duplication needs to be applied with utmost care as the increase in CPU utilization might outweigh the benefits considerably.
The combination of page skip and data compression works the best across workloads and results in a significant reduction in network traffic (40\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Abe:2016:UVM, author = "Yoshihisa Abe and Roxana Geambasu and Kaustubh Joshi and Mahadev Satyanarayanan", title = "Urgent Virtual Machine Eviction with Enlightened Post-Copy", journal = j-SIGPLAN, volume = "51", number = "7", pages = "51--64", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892252", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Virtual machine (VM) migration demands distinct properties under resource oversubscription and workload surges. We present enlightened post-copy, a new mechanism for VMs under contention that evicts the target VM with fast execution transfer and short total duration. This design contrasts with common live migration, which uses the down time of the migrated VM as its primary metric; it instead focuses on recovering the aggregate performance of the VMs being affected. In enlightened post-copy, the guest OS identifies memory state that is expected to encompass the VM's working set. The hypervisor accordingly transfers its state, mitigating the performance impact on the migrated VM resulting from post-copy transfer. We show that our implementation, with modest instrumentation in guest Linux, resolves VM contention up to several times faster than live migration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Xu:2016:SHS, author = "Xin Xu and Bhavesh Davda", title = "{SRVM}: Hypervisor Support for Live Migration with Passthrough {SR-IOV} Network Devices", journal = j-SIGPLAN, volume = "51", number = "7", pages = "65--77", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892256", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Single-Root I/O Virtualization (SR-IOV) is a specification that allows a single PCI Express (PCIe) device (physical function or PF) to be used as multiple PCIe devices (virtual functions or VF). In a virtualization system, each VF can be directly assigned to a virtual machine (VM) in passthrough mode to significantly improve the network performance. However, VF passthrough mode is not compatible with live migration, which is an essential capability that enables many advanced virtualization features such as high availability and resource provisioning. To solve this problem, we design SRVM which provides hypervisor support to ensure the VF device can be correctly used by the migrated VM and the applications. SRVM is implemented in the hypervisor without modification in guest operating systems or guest VM drivers. Our experimental results show that SRVM can effectively migrate all memory state, and there is no data loss or corruption in applications after live migration. SRVM does not increase VM downtime. 
It only costs limited resources (an extra CPU core), and there is no significant runtime overhead in VM network performance. In fact, since the VF can continue to be used during the pre-copy phase, it offers network throughput that is 9.6 times higher and network latency that is 98\% lower compared to other solutions that switch to para-virtualization mode during live migration.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Williams:2016:EEH, author = "Dan Williams and Yaohui Hu and Umesh Deshpande and Piush K. Sinha and Nilton Bila and Kartik Gopalan and Hani Jamjoom", title = "Enabling Efficient Hypervisor-as-a-Service Clouds with Ephemeral Virtualization", journal = j-SIGPLAN, volume = "51", number = "7", pages = "79--92", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892254", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When considering a hypervisor, cloud providers must balance conflicting requirements for simple, secure code bases with more complex, feature-filled offerings. This paper introduces Dichotomy, a new two-layer cloud architecture in which the roles of the hypervisor are split. The cloud provider runs a lean hyperplexor that has the sole task of multiplexing hardware and running more substantial hypervisors (called featurevisors) that implement features. Cloud users choose featurevisors from a selection of lightly-modified hypervisors potentially offered by third-parties in an ``as-a-service'' model for each VM. Rather than running the featurevisor directly on the hyperplexor using nested virtualization, Dichotomy uses a new virtualization technique called ephemeral virtualization which efficiently (and repeatedly) transfers control of a VM between the hyperplexor and featurevisor using memory mapping techniques. Nesting overhead is only incurred when the VM is accessed by the featurevisor. We have implemented Dichotomy in KVM/QEMU and demonstrate average switching times of 80 ms, two to three orders of magnitude faster than live VM migration. We show that, for the featurevisor applications we evaluated, VMs hosted in Dichotomy deliver up to 12\% better performance than those hosted on nested hypervisors, and continue to show benefit even when the featurevisor applications run as often as every 2.5~seconds.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Burtsev:2016:APV, author = "Anton Burtsev and David Johnson and Mike Hibler and Eric Eide and John Regehr", title = "Abstractions for Practical Virtual Machine Replay", journal = j-SIGPLAN, volume = "51", number = "7", pages = "93--106", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892257", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Efficient deterministic replay of whole operating systems is feasible and useful, so why isn't replay a default part of the software stack?
While implementing deterministic replay is hard, we argue that the main reason is the lack of general abstractions for understanding and addressing the significant engineering challenges involved in the development of a replay engine for a modern VMM. We present a design blueprint---a set of abstractions, general principles, and low-level implementation details---for efficient deterministic replay in a modern hypervisor. We build and evaluate our architecture in Xen, a full-featured hypervisor. Our architecture can be readily followed and adopted, enabling replay as a ubiquitous part of a modern virtualization stack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{McKinley:2016:NGV, author = "Kathryn S. McKinley", title = "Next Generation Virtual Memory Management", journal = j-SIGPLAN, volume = "51", number = "7", pages = "107--107", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892244", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The goal of virtual memory is an abstraction of infinite and private memory for every process. Unfortunately, the insatiable memory demands of modern applications increasingly violate this abstraction by exposing capacity, bandwidth, and performance limitations of modern hardware. Furthermore, emerging memory technologies are likely to exacerbate this problem. For instance, non-volatile memory differs from DRAM due to its asymmetric read/write performance and thus will likely be an addition rather than a drop-in replacement for DRAM. This talk will describe these problems and recent architecture and software innovations that address some of them. If adopted, these solutions will impose substantial challenges for operating system memory management, which has evolved very slowly over the past 30 years. I will draw lessons from the past 15 years of garbage collection advances to suggest some promising directions for innovation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Qian:2016:EFS, author = "Junjie Qian and Witawas Srisa-an and Sharad Seth and Hong Jiang and Du Li and Pan Yi", title = "Exploiting {FIFO} Scheduler to Improve Parallel Garbage Collection Performance", journal = j-SIGPLAN, volume = "51", number = "7", pages = "109--121", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892248", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent studies have found that parallel garbage collection performs worse with more CPUs and more collector threads. As part of this work, we further investigate this phenomenon and find that poor scalability is worst in highly scalable Java applications. Our investigation to find the causes clearly reveals that efficient multi-threading in an application can prolong the average object lifespan, which results in less effective garbage collection.
We also find that prolonging lifespan is the direct result of Linux's Completely Fair Scheduler due to its round-robin-like behavior that can increase the heap contention between the application threads. Instead, if we use pseudo first-in-first-out to schedule application threads in large multicore systems, the garbage collection scalability is significantly improved while the time spent in garbage collection is reduced by as much as 21\%. The average execution time of the 24 Java applications used in our study is also reduced by 11\%. Based on this observation, we propose two approaches to optimally select scheduling policies based on application scalability profile. Our first approach uses the profile information from one execution to tune the subsequent executions. Our second approach dynamically collects profile information and performs policy selection during execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Yu:2016:PAO, author = "Yang Yu and Tianyang Lei and Weihua Zhang and Haibo Chen and Binyu Zang", title = "Performance Analysis and Optimization of Full Garbage Collection in Memory-hungry Environments", journal = j-SIGPLAN, volume = "51", number = "7", pages = "123--130", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892251", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Garbage collection (GC), especially full GC, would nontrivially impact overall application performance, especially for those memory-hungry ones handling large data sets. This paper presents an in-depth performance analysis on the full GC performance of Parallel Scavenge (PS), a state-of-the-art and the default garbage collector in the HotSpot JVM, using traditional and big-data applications running atop JVM on CPU (e.g., Intel Xeon) and many-integrated cores (e.g., Intel Xeon Phi). The analysis uncovers that unnecessary memory accesses and calculations during reference updating in the compaction phase are the main causes of lengthy full GC. To this end, this paper describes an incremental query model for reference calculation, which is further embodied with three schemes (namely optimistic, sort-based and region-based) for different query patterns.
Performance evaluation shows that the incremental query model leads to an average speedup of 1.9X (up to 2.9X) in full GC and a 19.3\% (up to 57.2\%) improvement in application throughput, as well as a 31.2\% reduction in pause time over the vanilla PS collector on CPU, and the corresponding numbers are 2.1X (up to 3.4X), 11.1\% (up to 41.2\%) and 34.9\% for Xeon Phi.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Smith:2016:LMR, author = "Rebecca Smith and Scott Rixner", title = "Leveraging Managed Runtime Systems to Build, Analyze, and Optimize Memory Graphs", journal = j-SIGPLAN, volume = "51", number = "7", pages = "131--143", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892253", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Optimizing memory management is a major challenge of embedded systems programming, as memory is scarce. Further, embedded systems often have heterogeneous memory architectures, complicating the task of memory allocation during both compilation and migration. However, new opportunities for addressing these challenges have been created by the recent emergence of managed runtimes for embedded systems. By imposing structure on memory, these systems have opened the doors for new techniques for analyzing and optimizing memory usage within embedded systems. This paper presents GEM (Graphs of Embedded Memory), a tool which capitalizes on the structure that managed runtime systems provide in order to build memory graphs which facilitate memory analysis and optimization. At GEM's core is a set of fundamental graph transformations which can be layered to support a wide range of use cases, including interactive memory visualization, de-duplication of objects and code, compilation for heterogeneous memory architectures, and transparent migration. Moreover, since the same underlying infrastructure supports all of these orthogonal functionalities, they can easily be applied together to complement each other.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Ben-Yehuda:2016:NPM, author = "Muli Ben-Yehuda and Orna Agmon Ben-Yehuda and Dan Tsafrir", title = "The nom Profit-Maximizing Operating System", journal = j-SIGPLAN, volume = "51", number = "7", pages = "145--160", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892250", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the near future, cloud providers will sell their users virtual machines with CPU, memory, network, and storage resources whose prices constantly change according to market-driven supply and demand conditions. Running traditional operating systems in these virtual machines is a poor fit: traditional operating systems are not aware of changing resource prices and their sole aim is to maximize performance with no consideration of costs. Consequently, they yield low profits.
We present nom, a profit-maximizing operating system designed for cloud computing platforms with dynamic resource prices. Applications running on nom aim to maximize profits by optimizing simultaneously for performance and resource costs. The nom kernel provides them with direct access to the underlying hardware and full control over their private software stacks. Since nom applications know there is no single ``best'' software stack, they adapt their stacks' behavior on the fly according to the current price of available resources and their private utility from them, which differs between applications. We show that in addition to achieving up to 3.9x better throughput and up to 9.1x better latency, nom applications yield up to 11.1x higher profits when compared with the same applications running on Linux and OSv.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Hale:2016:EHP, author = "Kyle C. Hale and Peter A. Dinda", title = "Enabling Hybrid Parallel Runtimes Through Kernel and Virtualization Support", journal = j-SIGPLAN, volume = "51", number = "7", pages = "161--175", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892255", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In our hybrid runtime (HRT) model, a parallel runtime system and the application are together transformed into a specialized OS kernel that operates entirely in kernel mode and can thus implement exactly its desired abstractions on top of fully privileged hardware access. We describe the design and implementation of two new tools that support the HRT model. The first, the Nautilus Aerokernel, is a kernel framework specifically designed to enable HRTs for x64 and Xeon Phi hardware. Aerokernel primitives are specialized for HRT creation and thus can operate much faster, up to two orders of magnitude faster, than related primitives in Linux. Aerokernel primitives also exhibit much lower variance in their performance, an important consideration for some forms of parallelism. We have realized several prototype HRTs, including one based on the Legion runtime, and we provide application macrobenchmark numbers for our Legion HRT. The second tool, the hybrid virtual machine (HVM), is an extension to the Palacios virtual machine monitor that allows a single virtual machine to simultaneously support a traditional OS and software stack alongside an HRT with specialized hardware access. 
The HRT can be booted in a time comparable to a Linux user process startup, and functions in the HRT, which operate over the user process's memory, can be invoked by the process with latencies not much higher than those of a function call.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Waldspurger:2016:SSL, author = "Carl Waldspurger and Emery Berger and Abhishek Bhattacharjee and Kevin Pedretti and Simon Peter and Chris Rossbach", title = "Sweet Spots and Limits for Virtualization", journal = j-SIGPLAN, volume = "51", number = "7", pages = "177--177", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892249", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This year at VEE, we added a panel to discuss the state of virtualization: What problems are solved? What problems are important? And what problems may not be worth solving? The panelists are experts in areas ranging from hardware virtualization up to language-level virtualization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '16 conference proceedings.", } @Article{Nitu:2017:SBQ, author = "Vlad Nitu and Pierre Olivier and Alain Tchana and Daniel Chiba and Antonio Barbalace and Daniel Hagimont and Binoy Ravindran", title = "Swift Birth and Quick Death: Enabling Fast Parallel Guest Boot and Destruction in the {Xen} Hypervisor", journal = j-SIGPLAN, volume = "52", number = "7", pages = "1--14", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050758", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The ability to quickly set up and tear down a virtual machine is critical for today's cloud elasticity, as well as in numerous other scenarios: guest migration/consolidation, event-driven invocation of micro-services, dynamically adaptive unikernel-based applications, micro-reboots for security or stability, etc. In this paper, we focus on the process of setting up/freeing the hypervisor and host control layer data structures at boot/destruction time, showing that it does not scale in current virtualization solutions. In addition to the direct overhead of long VM set-up/destruction times, we demonstrate by experimentation the indirect costs on real-world auto-scaling systems. Focusing on the popular Xen hypervisor, we identify three critical issues hindering the scalability of the boot and destruction processes: serialized boot, unscalable interactions with the Xenstore at guest creation time, and remote NUMA memory scrubbing at destruction time. For each of these issues we present the design and implementation of a solution in the Xen infrastructure: parallel boot with fine-grained locking, caching of Xenstore data, and local NUMA scrubbing. We evaluate these solutions using micro-benchmarks, macro-benchmarks, and real-world datacenter traces.
Results show that our work improves the current Xen implementation by a significant factor; for example, macro-benchmarks indicate a speedup of more than 4X in high-load scenarios.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Kuenzer:2017:UEC, author = "Simon Kuenzer and Anton Ivanov and Filipe Manco and Jose Mendes and Yuri Volchkov and Florian Schmidt and Kenichi Yasukata and Michio Honda and Felipe Huici", title = "Unikernels Everywhere: The Case for Elastic {CDNs}", journal = j-SIGPLAN, volume = "52", number = "7", pages = "15--29", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050757", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Video streaming dominates the Internet's overall traffic mix, with reports stating that it will constitute 90\% of all consumer traffic by 2019. Most of this video is delivered by Content Delivery Networks (CDNs), and, while they optimize QoE metrics such as buffering ratio and start-up time, no single CDN provides optimal performance. In this paper we make the case for elastic CDNs, the ability to build virtual CDNs on-the-fly on top of shared, third-party infrastructure at scale. To bring this idea closer to reality, we begin with large-scale simulations to quantify the effects that elastic CDNs would have if deployed, and build and evaluate MiniCache, a specialized, minimalistic virtualized content cache that runs on the Xen hypervisor. MiniCache is able to serve content at rates of up to 32 Gb/s and handle up to 600K reqs/sec on a single CPU core, as well as boot in about 90 milliseconds on x86 and around 370 milliseconds on ARM32.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Zhang:2017:MAP, author = "Jinshi Zhang and Eddie Dong and Jian Li and Haibing Guan", title = "{MigVisor}: Accurate Prediction of {VM} Live Migration Behavior using a Working-Set Pattern Model", journal = j-SIGPLAN, volume = "52", number = "7", pages = "30--43", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050753", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Live migration of a virtual machine (VM) is a powerful technique with benefits of server maintenance, resource management, dynamic workload re-balance, etc. Modern research has effectively reduced the VM live migration (VMLM) time to dozens of milliseconds, but live migration still exhibits failures if it cannot terminate within the given time constraint. The ability to predict this type of failure can avoid wasting networking and computing resources on the VM migration, and the associated system performance degradation caused by wasting these resources. The cost of VM live migration highly depends on the application workload of the VM, which may undergo frequent changes. At the same time, the available system resources for VM migration can also change substantially and frequently.
To account for these issues, we present a solution called MigVisor, which can accurately predict the behaviour of VM migration using a working-set model. This can enable system managers to predict the migration cost and enhance the system management efficacy. The experimental results prove the design suitability and show that MigVisor has high prediction accuracy, since the average relative error between the predicted value and the measured value is only 6.2\%--9\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Garg:2017:CGA, author = "Anshuj Garg and Debadatta Mishra and Purushottam Kulkarni", title = "{Catalyst}: {GPU}-assisted rapid memory deduplication in virtualization environments", journal = j-SIGPLAN, volume = "52", number = "7", pages = "44--59", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050760", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Content based page sharing techniques improve memory efficiency in virtualized systems by identifying and merging identical pages. Kernel Same-page Merging (KSM), a Linux kernel utility for page sharing, sequentially scans memory pages of virtual machines to deduplicate pages. Sequential scanning of pages has several undesirable side effects---wasted CPU cycles when no sharing opportunities exist, and rate of discovery of sharing being dependent on the scanning rate and corresponding CPU availability. In this work, we exploit the presence of GPUs on modern systems to enable rapid memory sharing through targeted scanning of pages. Our solution, Catalyst, works in two phases, the first where pages of virtual machines are processed by the GPU to identify likely pages for sharing and a second phase that performs page-level similarity checks on a targeted set of shareable pages. Opportunistic usage of the GPU to produce sharing hints enables rapid and low-overhead duplicate detection, and sharing of memory pages in virtualization environments. We evaluate Catalyst against various benchmarks and workloads to demonstrate that Catalyst can achieve higher memory sharing in less time compared to different scan rate configurations of KSM, at lower or comparable compute costs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Fumero:2017:JTG, author = "Juan Fumero and Michel Steuwer and Lukas Stadler and Christophe Dubach", title = "Just-In-Time {GPU} Compilation for Interpreted Languages with Partial Evaluation", journal = j-SIGPLAN, volume = "52", number = "7", pages = "60--73", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050761", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computer systems are increasingly featuring powerful parallel devices with the advent of many-core CPUs and GPUs. This offers the opportunity to solve computationally-intensive problems at a fraction of the time traditional CPUs need.
However, exploiting heterogeneous hardware requires the use of low-level programming language approaches such as OpenCL, which is incredibly challenging, even for advanced programmers. On the application side, interpreted dynamic languages are increasingly becoming popular in many domains due to their simplicity, expressiveness and flexibility. However, this creates a wide gap between the high-level abstractions offered to programmers and the low-level hardware-specific interface. Currently, programmers must rely on high-performance libraries or they are forced to write parts of their application in a low-level language like OpenCL. Ideally, nonexpert programmers should be able to exploit heterogeneous hardware directly from their interpreted dynamic languages. In this paper, we present a technique to transparently and automatically offload computations from interpreted dynamic languages to heterogeneous devices. Using just-in-time compilation, we automatically generate OpenCL code at runtime which is specialized to the actual observed data types using profiling information. We demonstrate our technique using R, which is a popular interpreted dynamic language predominantly used in big data analytics. Our experimental results show that execution on a GPU yields speedups of over 150x compared to the sequential FastR implementation and that the obtained performance is competitive with manually written GPU code. We also show that when taking into account start-up time, large speedups are achievable, even when the applications run for as little as a few seconds.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Kotselidis:2017:HMR, author = "Christos Kotselidis and James Clarkson and Andrey Rodchenko and Andy Nisbet and John Mawer and Mikel Luj{\'a}n", title = "Heterogeneous Managed Runtime Systems: a Computer Vision Case Study", journal = j-SIGPLAN, volume = "52", number = "7", pages = "74--82", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050764", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Real-time 3D space understanding is becoming prevalent across a wide range of applications and hardware platforms. To meet the desired Quality of Service (QoS), computer vision applications tend to be heavily parallelized and exploit any available hardware accelerators. Current approaches to achieving real-time computer vision revolve around programming languages typically associated with High Performance Computing along with binding extensions for OpenCL or CUDA execution. Such implementations, although high performing, lack portability across the wide range of diverse hardware resources and accelerators. In this paper, we showcase how a complex computer vision application can be implemented within a managed runtime system. We discuss the complexities of achieving high-performing and portable execution across embedded and desktop configurations.
Furthermore, we demonstrate that it is possible to achieve the QoS target of over 30 frames per second (FPS) by exploiting FPGA and GPGPU acceleration transparently through the managed runtime system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Deng:2017:DWT, author = "Liang Deng and Peng Liu and Jun Xu and Ping Chen and Qingkai Zeng", title = "Dancing with Wolves: Towards Practical Event-driven {VMM} Monitoring", journal = j-SIGPLAN, volume = "52", number = "7", pages = "83--96", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050750", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a novel framework that enables practical event-driven monitoring for untrusted virtual machine monitors (VMMs) in cloud computing. Unlike previous approaches for VMM monitoring, our framework neither relies on a higher privilege level nor requires any special hardware support. Instead, we place the trusted monitor at the same privilege level and in the same address space with the untrusted VMM to achieve superior efficiency, while proposing a unique mutual-protection mechanism to ensure the integrity of the monitor. Our security analysis demonstrates that our framework can provide high-assurance for event-driven VMM monitoring, even if the highest-privilege VMM is fully compromised. The experimental results show that our framework only incurs trivial performance overhead for enforcing event-driven monitoring policies, exhibiting tremendous performance improvement on previous approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Suneja:2017:SIL, author = "Sahil Suneja and Ricardo Koller and Canturk Isci and Eyal de Lara and Ali Hashemi and Arnamoy Bhattacharyya and Cristiana Amza", title = "Safe Inspection of Live Virtual Machines", journal = j-SIGPLAN, volume = "52", number = "7", pages = "97--111", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050766", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With DevOps automation and an everything-as-code approach to lifecycle management for cloud-native applications, challenges emerge from an operational visibility and control perspective. Once a VM is deployed in production it typically becomes a hands-off entity in terms of restrictions towards inspecting or tuning it, for the fear of negatively impacting its operation. We present CIVIC (Cloning and Injection based VM Inspection for Cloud), a new mechanism that enables safe inspection of unmodified production VMs on-the-fly. CIVIC restricts all impact and side-effects of inspection or analysis operations inside a live clone of the production VM. New functionality over the replicated VM state is introduced using code injection. In this paper, we describe the design and implementation of our solution over KVM/QEMU. 
We demonstrate four of its use cases: (i) safe reuse of system monitoring agents, (ii) impact-heavy problem diagnostics and troubleshooting, (iii) attaching an intrusive anomaly detector to a live service, and (iv) live tuning of a webserver's configuration parameters. Our evaluation shows CIVIC is nimble and lightweight in terms of memory footprint as well as clone activation time (6.5s), and has a low impact on the original VM ({$<$} 10\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Arulraj:2017:IVS, author = "Leo Arulraj and Andrea C. Arpaci-Dusseau and Remzi H. Arpaci-Dusseau", title = "Improving Virtualized Storage Performance with Sky", journal = j-SIGPLAN, volume = "52", number = "7", pages = "112--128", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050755", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce Sky, an extension to the VMM that gathers insights and information by intercepting system calls made by guest applications. We show how Sky gains three specific insights --- guest file-size information, metadata-data distinction, and file-content hints --- and uses said information to enhance virtualized-storage performance. By caching small files and metadata with higher priority, Sky reduces the runtime by 2.3 to 8.8 times for certain workloads. Sky also achieves 4.5 to 18.7 times reduction in the runtime of an open-source block-layer deduplication system by exploiting hints about file contents. Sky works underneath both Linux and FreeBSD guests, as well as under a range of file systems, thus enabling portable and general VMM-level optimization underneath a wide range of storage stacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Hetzelt:2017:SAE, author = "Felicitas Hetzelt and Robert Buhren", title = "Security Analysis of Encrypted Virtual Machines", journal = j-SIGPLAN, volume = "52", number = "7", pages = "129--142", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050763", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cloud computing has become indispensable in today's computer landscape. The flexibility it offers for customers as well as for providers has become a crucial factor for large parts of the computer industry. Virtualization is the key technology that allows for sharing of hardware resources among different customers. The controlling software component, called hypervisor, provides a virtualized view of the computer resources and ensures separation of different guest virtual machines. However, this important cornerstone of cloud computing is not necessarily trustworthy or bug-free. To mitigate this threat, AMD introduced Secure Encrypted Virtualization, short SEV, which transparently encrypts a virtual machine's memory. In this paper we analyse to what extent the proposed features can resist a malicious hypervisor and discuss the tradeoffs imposed by additional protection mechanisms.
To do so, we developed a model of SEV's security capabilities based on the available documentation, as actual silicon implementations are not yet on the market. We found that the first proposed version of SEV is not up to the task owing to three design shortcomings. First, the virtual machine control block is not encrypted and is handled directly by the hypervisor, allowing it to bypass VM memory encryption by executing conveniently chosen gadgets. Secondly, the general purpose registers are not encrypted upon vmexit, leaking potentially sensitive data. Finally, the control over the nested page tables allows a malicious hypervisor to closely monitor the execution state of a VM and attack it with memory replay attacks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Wang:2017:RLW, author = "Zhe Wang and Chenggang Wu and Jianjun Li and Yuanming Lai and Xiangyu Zhang and Wei-Chung Hsu and Yueqiang Cheng", title = "{ReRanz}: a Light-Weight Virtual Machine to Mitigate Memory Disclosure Attacks", journal = j-SIGPLAN, volume = "52", number = "7", pages = "143--156", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050752", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent code reuse attacks are able to circumvent various address space layout randomization (ASLR) techniques by exploiting memory disclosure vulnerabilities. To mitigate sophisticated code reuse attacks, we proposed a light-weight virtual machine, ReRanz, which deployed a novel continuous binary code re-randomization to mitigate memory disclosure oriented attacks. In order to meet security and performance goals, costly code randomization operations were outsourced to a separate process, called the ``shuffling process''. The shuffling process continuously flushed the old code and replaced it with a fine-grained randomized code variant. ReRanz repeated the process each time an adversary might obtain the information and upload a payload. Our performance evaluation shows that the ReRanz Virtual Machine incurs a very low performance overhead. The security evaluation shows that ReRanz successfully protects the Nginx web server against the Blind-ROP attack.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Estrada:2017:UDP, author = "Zachary J. Estrada and Read Sprabery and Lok Yan and Zhongzhi Yu and Roy Campbell and Zbigniew Kalbarczyk and Ravishankar K. Iyer", title = "Using {OS} Design Patterns to Provide Reliability and Security as-a-Service for {VM-based} Clouds", journal = j-SIGPLAN, volume = "52", number = "7", pages = "157--170", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050759", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper extends the concepts behind cloud services to offer hypervisor-based reliability and security monitors for cloud virtual machines.
Cloud VMs can be heterogeneous and as such guest OS parameters needed for monitoring can vary across different VMs and must be obtained in some way. Past work involves running code inside the VM, which is unacceptable for a cloud environment. We solve this problem by recognizing that there are common OS design patterns that can be used to infer monitoring parameters from the guest OS. We extract information about the cloud user's guest OS with the user's existing VM image and knowledge of OS design patterns as the only inputs to analysis. To demonstrate the range of monitoring functionality possible with this technique, we implemented four sample monitors: a guest OS process tracer, an OS hang detector, a return-to-user attack detector, and a process-based keylogger detector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Hussein:2017:OPR, author = "Ahmed Hussein and Mathias Payer and Antony L. Hosking and Chris Vick", title = "One Process to Reap Them All: Garbage Collection as-a-Service", journal = j-SIGPLAN, volume = "52", number = "7", pages = "171--186", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050754", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Ubiquitous mobile platforms such as Android rely on managed language run-time environments, also known as language virtual machines (VMs), to run a diverse range of user applications (apps). Each app runs in its own private VM instance, and each VM makes its own private local decisions in managing its use of processor and memory resources. Moreover, the operating system and the hardware do not communicate their low-level decisions regarding power management with the high-level app environment. This lack of coordination across layers and across apps restricts more effective global use of resources on the device. We address this problem by devising and implementing a global memory manager service for Android that optimizes memory usage, run-time performance, and power consumption globally across all apps running on the device. The service focuses on the impact of garbage collection (GC) along these dimensions, since GC poses a significant overhead within managed run-time environments. Our prototype collects system-wide statistics from all running VMs, makes centralized decisions about memory management across apps and across software layers, and also collects garbage centrally. Furthermore, the global memory manager coordinates with the power manager to tune collector scheduling. In our evaluation, we illustrate the impact of such a central memory management service in reducing total energy consumption (up to 18\%) and increasing throughput (up to 12\%), and improving memory utilization and adaptability to user activities.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Zhang:2017:DLN, author = "Jie Zhang and Xiaoyi Lu and Dhabaleswar K. 
(DK) Panda", title = "Designing Locality and {NUMA} Aware {MPI} Runtime for Nested Virtualization based {HPC} Cloud with {SR--IOV} Enabled {InfiniBand}", journal = j-SIGPLAN, volume = "52", number = "7", pages = "187--200", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050765", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hypervisor-based virtualization solutions reveal good security and isolation, while container-based solutions make applications and workloads more portable and distributed in an effective, standardized and repeatable way. Therefore, nested virtualization based computing environments (e.g., container over virtual machine), which inherit the capabilities from both solutions, are becoming more and more attractive in clouds (e.g., running Docker over Amazon EC2 VMs). Recent studies have shown that running applications in either VMs or containers still has significant overhead, especially for I/O intensive workloads. This motivates us to investigate whether the nested virtualization based solution can be adopted to build high-performance computing (HPC) clouds for running MPI applications efficiently and where the bottlenecks lie. To eliminate performance bottlenecks, we propose a high-performance two-layer locality and NUMA aware MPI library, which is able to dynamically detect co-resident containers inside one VM as well as detect co-resident VM inside one host at MPI runtime. Thus the MPI processes across different containers and VMs can communicate to each other by shared memory or Cross Memory Attach (CMA) channels instead of network channel if they are co-resident. We further propose an enhanced NUMA aware hybrid design to utilize InfiniBand loopback based channel to optimize large message transfer across containers when they are running on different sockets. Performance evaluations show that compared with the performance of the state-of-art (1Layer) design, our proposed enhance-hybrid design can bring up to 184\%, 81\% and 12\% benefit on point-to-point, collective operations, and end applications. Compared with the default performance, our enhanced-hybrid design delivers up to 184\%, 85\% and 16\% performance improvement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Lu:2017:FPL, author = "Kai Lu and Wenzhe Zhang and Xiaoping Wang and Mikel Luj{\'a}n and Andy Nisbet", title = "Flexible Page-level Memory Access Monitoring Based on Virtualization Hardware", journal = j-SIGPLAN, volume = "52", number = "7", pages = "201--213", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050751", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Page protection is often used to achieve memory access monitoring in many applications, dealing with program-analysis, checkpoint-based failure recovery, and garbage collection in managed runtime systems. Typically, low overhead access monitoring is limited by the relatively large page-level granularity of memory management unit hardware support for virtual memory protection. 
In this paper, we improve upon traditional page-level mechanisms by additionally using hardware support for virtualization in order to achieve fine and flexible granularities that can be smaller than a page. We first introduce a memory allocator based on page protection that can achieve fine-grained monitoring. Second, we explain how virtualization hardware support can be used to achieve dynamic adjustment of the monitoring granularity. In all, we propose a process-level virtual machine to achieve dynamic and fine-grained monitoring. Any application can run on our process-level virtual machine without modification. Experimental results for an incremental checkpoint tool provide a use-case to demonstrate our work. Comparing with traditional page-based checkpoint, our work can effectively reduce the amount of checkpoint data and improve performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Yang:2017:CLA, author = "Chun Yang and Xianhua Liu and Xu Cheng", title = "Content Look-Aside Buffer for Redundancy-Free Virtual Disk {I/O} and Caching", journal = j-SIGPLAN, volume = "52", number = "7", pages = "214--227", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050762", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Storage consolidation in a virtualized environment introduces numerous duplications in virtual disks and imposes considerable pressure on disk I/O and caching. In this paper, we present a content look-aside buffer (CLB) approach for simultaneously providing redundancy-free virtual disk I/O and caching. CLB attaches persistent fingerprints to virtual disk blocks, which enables detection of I/O redundancy before disk access. At run time, CLB exploits content pages already present in the guest disk caches to service the redundant reads through page sharing, thus eliminating both redundant I/O requests and redundant disk cache copies. For write requests, CLB uses a group invalidating writeback protocol for updating fingerprints to support crash consistency while minimizing disk write overhead. By implementing and evaluating a CLB prototype on KVM hypervisor, we demonstrate that CLB delivers considerably improved I/O performance with realistic workloads. Our CLB prototype improves the throughput of sequential and random read on duplicate data by 4.1x and 26.2x, respectively. For typical read-intensive workloads, such as booting VM and launching application, CLB's I/O deduplication and cache deduplication eliminates 94.9\%--98.5\% of read requests and saves 50\%--100\% cache memory in each VM, respectively. Compared with the QEMU's raw virtual disk format, CLB improves the per-disk VM density by 8x--16x. 
For mixed read-write workloads, the cost of on-line fingerprint updating offsets the read benefit; nevertheless, CLB substantially improves overall performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{dAntras:2017:HXU, author = "Amanieu d'Antras and Cosmin Gorgovan and Jim Garside and John Goodacre and Mikel Luj{\'a}n", title = "{HyperMAMBO-X64}: Using Virtualization to Support High-Performance Transparent Binary Translation", journal = j-SIGPLAN, volume = "52", number = "7", pages = "228--241", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050756", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Current computer architectures --- ARM, MIPS, PowerPC, SPARC, x86 --- have evolved from a 32-bit architecture to a 64-bit one. Computer architects often consider whether it could be possible to eliminate hardware support for a subset of the instruction set so as to reduce hardware complexity, which could improve performance, reduce power usage and accelerate processor development. This paper considers the scenario where we want to eliminate 32-bit hardware support from the ARMv8 architecture. Dynamic binary translation can be used for this purpose and generally comes in one of two forms: application-level translators that translate a single user mode process on top of a native operating system, and system-level translators that translate an entire operating system and all its processes. Application-level translators can have good performance but are not totally transparent; system-level translators may be 100\% compatible but performance suffers. HyperMAMBO-X64 uses a new approach that gets the best of both worlds, being able to run the translator as an application under the hypervisor but still react to the behavior of guest operating systems. It works with complete transparency with regard to the virtualized system whilst delivering performance close to that provided by hardware execution. A key factor in the low overhead of HyperMAMBO-X64 is its deep integration with the virtualization and memory management features of ARMv8. These are exploited to support caching of translations across multiple address spaces while ensuring that translated code remains consistent with the source instructions it is based on.
We show how these attributes are achieved without sacrificing either performance or accuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Zhu:2017:VLV, author = "Min Zhu and Bibo Tu and Wei Wei and Dan Meng", title = "{HA-VMSI}: a Lightweight Virtual Machine Isolation Approach with Commodity Hardware for {ARM}", journal = j-SIGPLAN, volume = "52", number = "7", pages = "242--256", month = jul, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140607.3050767", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Once compromising the hypervisor, remote or local adversaries can easily access other customers' sensitive data in the memory and context of guest virtual machines (VMs). VM isolation is an efficient mechanism for protecting the memory of guest VMs from unauthorized access. However, previous VM isolation systems either modify hardware architecture or introduce a software module without being protected, and most of them focus on the x86 architecture. This paper proposes HA-VMSI, a lightweight hardware-assisted VM isolation approach for ARM, to provide runtime protection of guest VMs, even with a compromised hypervisor. In the ARM TrustZone secure world, a thin security monitor is introduced as HA-VMSI's entire TCB. Hence, the security monitor is much less vulnerable and safe from attacks that can compromise the hypervisor. The key of HA-VMSI is decoupling the functions of memory isolation among VMs from the hypervisor into the security monitor. As a result, the hypervisor can only update the Stage-2 page tables of VMs via the security monitor, which inspects and approves each new mapping. It is worth noting that HA-VMSI is more secure and effective than current software approaches, and more flexible and compatible than hardware approaches. We have implemented a prototype for KVM hypervisor with multiple Linux as guest OSes on Juno board. The security assessment and performance evaluation show that HA-VMSI is effective, efficient and practical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '17 conference proceedings.", } @Article{Steele:2017:TNO, author = "Guy L. {Steele, Jr.}", title = "It's Time for a New Old Language", journal = j-SIGPLAN, volume = "52", number = "8", pages = "1--1", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018773", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The most popular programming language in computer science has no compiler or interpreter. Its definition is not written down in any one place. It has changed a lot over the decades, and those changes have introduced ambiguities and inconsistencies. Today, dozens of variations are in use, and its complexity has reached the point where it needs to be re-explained, at least in part, every time it is used. Much effort has been spent in hand-translating between this language and other languages that do have compilers. The language is quite amenable to parallel computation, but this fact has gone unexploited. 
In this talk we will summarize the history of the language, highlight the variations and some of the problems that have arisen, and propose specific solutions. We suggest that it is high time that this language be given a complete formal specification, and that compilers, IDEs, and proof-checkers be created to support it, so that all the best tools and techniques of our trade may be applied to it also.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Chen:2017:ESF, author = "Guoyang Chen and Yue Zhao and Xipeng Shen and Huiyang Zhou", title = "{EffiSha}: a Software Framework for Enabling Efficient Preemptive Scheduling of {GPU}", journal = j-SIGPLAN, volume = "52", number = "8", pages = "3--16", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018748", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern GPUs are broadly adopted in many multitasking environments, including data centers and smartphones. However, the current support for the scheduling of multiple GPU kernels (from different applications) is limited, forming a major barrier for GPU to meet many practical needs. This work for the first time demonstrates that on existing GPUs, efficient preemptive scheduling of GPU kernels is possible even without special hardware support. Specifically, it presents EffiSha, a pure software framework that enables preemptive scheduling of GPU kernels with very low overhead. The enabled preemptive scheduler offers flexible support of kernels of different priorities, and demonstrates significant potential for reducing the average turnaround time and improving the system overall throughput of programs that time share a modern GPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Cohen:2017:LLS, author = "Nachshon Cohen and Arie Tal and Erez Petrank", title = "Layout Lock: a Scalable Locking Paradigm for Concurrent Data Layout Modifications", journal = j-SIGPLAN, volume = "52", number = "8", pages = "17--29", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018753", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data-structures can benefit from dynamic data layout modifications when the size or the shape of the data structure changes during the execution, or when different phases in the program execute different workloads. However, in a modern multi-core environment, layout modifications involve costly synchronization overhead. In this paper we propose a novel layout lock that incurs a negligible overhead for reads and a small overhead for updates of the data structure. We then demonstrate the benefits of layout changes and also the advantages of the layout lock as its supporting synchronization mechanism for two data structures. In particular, we propose a concurrent binary search tree, and a concurrent array set, that benefit from concurrent layout modifications using the proposed layout lock. 
Experience demonstrates performance advantages and integration simplicity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Zhang:2017:UGM, author = "Xiuxia Zhang and Guangming Tan and Shuangbai Xue and Jiajia Li and Keren Zhou and Mingyu Chen", title = "Understanding the {GPU} Microarchitecture to Achieve Bare-Metal Performance Tuning", journal = j-SIGPLAN, volume = "52", number = "8", pages = "31--43", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018755", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we present a methodology to understand GPU microarchitectural features and improve performance for compute-intensive kernels. The methodology relies on a reverse engineering approach to crack the GPU ISA encodings in order to build a GPU assembler. An assembly microbenchmark suite correlates microarchitectural features with their performance factors to uncover instruction-level and memory hierarchy preferences. We use SGEMM as a running example to show the ways to achieve bare-metal performance tuning. The performance boost is achieved by tuning FFMA throughput by activating dual-issue, eliminating register bank conflicts, adding non-FFMA instructions with little penalty, and choosing proper width of global/shared load instructions. On NVIDIA Kepler K20m, we develop a faster SGEMM with 3.1Tflop/s performance and 88\% efficiency; the performance is 15\% higher than cuBLAS7.0. Applying these optimizations to convolution, the implementation gains 39\%--62\% performance improvement compared with cuDNN4.0. The toolchain is an attempt to automatically crack different GPU ISA encodings and build an assembler adaptively for the purpose of performance enhancements to applications on GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Ou:2017:CCD, author = "Peizhao Ou and Brian Demsky", title = "Checking Concurrent Data Structures Under the {C\slash C++11} Memory Model", journal = j-SIGPLAN, volume = "52", number = "8", pages = "45--59", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018749", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent data structures often provide better performance on multi-core processors but are significantly more difficult to design and test than their sequential counterparts. The C/C++11 standard introduced a weak memory model with support for low-level atomic operations such as compare and swap (CAS). While low-level atomic operations can significantly improve the performance of concurrent data structures, they introduce non-intuitive behaviors that can increase the difficulty of developing code. In this paper, we develop a correctness model for concurrent data structures that make use of atomic operations. Based on this correctness model, we present CDSSPEC, a specification checker for concurrent data structures under the C/C++11 memory model. 
We have evaluated CDSSPEC on 10 concurrent data structures, among which CDSSPEC detected 3 known bugs and 93\% of the injected bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Chabbi:2017:EAL, author = "Milind Chabbi and Abdelhalim Amer and Shasha Wen and Xu Liu", title = "An Efficient Abortable-locking Protocol for Multi-level {NUMA} Systems", journal = j-SIGPLAN, volume = "52", number = "8", pages = "61--74", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018768", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The popularity of Non-Uniform Memory Access (NUMA) architectures has led to numerous locality-preserving hierarchical lock designs, such as HCLH, HMCS, and cohort locks. Locality-preserving locks trade fairness for higher throughput. Hence, some instances of acquisitions can incur long latencies, which may be intolerable for certain applications. Few locks admit a waiting thread to abandon its protocol on a timeout. State-of-the-art abortable locks are not fully locality aware, introduce high overheads, and unsuitable for frequent aborts. Enhancing locality-aware locks with lightweight timeout capability is critical for their adoption. In this paper, we design and evaluate the HMCS-T lock, a Hierarchical MCS (HMCS) lock variant that admits a timeout. HMCS-T maintains the locality benefits of HMCS while ensuring aborts to be lightweight. HMCS-T offers the progress guarantee missing in most abortable queuing locks. Our evaluations show that HMCS-T offers the timeout feature at a moderate overhead over its HMCS analog. HMCS-T, used in an MPI runtime lock, mitigated the poor scalability of an MPI+OpenMP BFS code and resulted in 4.3x superior scaling.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Acar:2017:CSC, author = "Umut A. Acar and Naama Ben-David and Mike Rainey", title = "Contention in Structured Concurrency: Provably Efficient Dynamic Non-Zero Indicators for Nested Parallelism", journal = j-SIGPLAN, volume = "52", number = "8", pages = "75--88", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018762", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the past two decades, many concurrent data structures have been designed and implemented. Nearly all such work analyzes concurrent data structures empirically, omitting asymptotic bounds on their efficiency, partly because of the complexity of the analysis needed, and partly because of the difficulty of obtaining relevant asymptotic bounds: when the analysis takes into account important practical factors, such as contention, it is difficult or even impossible to prove desirable bounds. In this paper, we show that considering structured concurrency or relaxed concurrency models can enable establishing strong bounds, also for contention. 
To this end, we first present a dynamic relaxed counter data structure that indicates the non-zero status of the counter. Our data structure extends a recently proposed data structure, called SNZI, allowing our structure to grow dynamically in response to the increasing degree of concurrency in the system. Using the dynamic SNZI data structure, we then present a concurrent data structure for series-parallel directed acyclic graphs (sp-dags), a key data structure widely used in the implementation of modern parallel programming languages. The key component of sp-dags is an in-counter data structure that is an instance of our dynamic SNZI. We analyze the efficiency of our concurrent sp-dags and in-counter data structures under the nested-parallel computing paradigm. This paradigm offers a structured model for concurrency. Under this model, we prove that our data structures require amortized $O(1)$ shared memory steps, including contention. We present an implementation and an experimental evaluation that suggests that the sp-dags data structure is practical and can perform well in practice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Sato:2017:NIT, author = "Kento Sato and Dong H. Ahn and Ignacio Laguna and Gregory L. Lee and Martin Schulz and Christopher M. Chambreau", title = "Noise Injection Techniques to Expose Subtle and Unintended Message Races", journal = j-SIGPLAN, volume = "52", number = "8", pages = "89--101", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018767", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Debugging intermittently occurring bugs within MPI applications is challenging, and message races, a condition in which two or more sends race to match with a receive, are one of the common root causes. Many debugging tools have been proposed to help programmers resolve them, but their runtime interference perturbs the timing such that subtle races often cannot be reproduced with debugging tools. We present novel noise injection techniques to expose message races even under a tool's control. We first formalize this race problem in the context of non-deterministic parallel applications and use this analysis to determine an effective noise-injection strategy to uncover them. We codified these techniques in NINJA (Noise INJection Agent) that exposes these races without modification to the application.
Our evaluations on synthetic cases as well as a real-world bug in Hypre-2.10.1 show that NINJA significantly helps expose races.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Luo:2017:TDS, author = "Hao Luo and Pengcheng Li and Chen Ding", title = "Thread Data Sharing in Cache: Theory and Measurement", journal = j-SIGPLAN, volume = "52", number = "8", pages = "103--115", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018759", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "On modern multi-core processors, independent workloads often interfere with each other by competing for shared cache space. However, for multi-threaded workloads, where a single copy of data can be accessed by multiple threads, the threads can cooperatively share cache. Because data sharing consolidates the collective working set of threads, the effective size of shared cache becomes larger than it would have been when data are not shared. This paper presents a new theory of data sharing. It includes (1) a new metric called the shared footprint to mathematically compute the amount of data shared by any group of threads in any size cache, and (2) a linear-time algorithm to measure shared footprint by scanning the memory trace of a multi-threaded program. The paper presents the practical implementation and evaluates the new theory using 14 PARSEC and SPEC OMP benchmarks, including an example use of shared footprint in program optimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Ren:2017:EVM, author = "Bin Ren and Sriram Krishnamoorthy and Kunal Agrawal and Milind Kulkarni", title = "Exploiting Vector and Multicore Parallelism for Recursive, Data- and Task-Parallel Programs", journal = j-SIGPLAN, volume = "52", number = "8", pages = "117--130", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018763", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern hardware contains parallel execution resources that are well-suited for data parallelism (vector units) and task parallelism (multicores). However, most work on parallel scheduling focuses on one type of hardware or the other. In this work, we present a scheduling framework that allows for a unified treatment of task- and data-parallelism. Our key insight is an abstraction, task blocks, that uniformly handles data-parallel iterations and task-parallel tasks, allowing them to be scheduled on vector units or executed independently as multicores. Our framework allows us to define schedulers that can dynamically select between executing task-blocks on vector units or multicores. We show that these schedulers are asymptotically optimal, and deliver the maximum amount of parallelism available in computation trees. To evaluate our schedulers, we develop program transformations that can convert mixed data- and task-parallel programs into task block-based programs.
Using a prototype instantiation of our scheduling framework, we show that, on an 8-core system, we can simultaneously exploit vector and multicore parallelism to achieve $ 14 \times $--$ 108 \times $ speedup over sequential baselines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Shudler:2017:IPC, author = "Sergei Shudler and Alexandru Calotoiu and Torsten Hoefler and Felix Wolf", title = "Isoefficiency in Practice: Configuring and Understanding the Performance of Task-based Applications", journal = j-SIGPLAN, volume = "52", number = "8", pages = "131--143", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018770", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Task-based programming offers an elegant way to express units of computation and the dependencies among them, making it easier to distribute the computational load evenly across multiple cores. However, this separation of problem decomposition and parallelism requires a sufficiently large input problem to achieve satisfactory efficiency on a given number of cores. Unfortunately, finding a good match between input size and core count usually requires significant experimentation, which is expensive and sometimes even impractical. In this paper, we propose an automated empirical method for finding the isoefficiency function of a task-based program, binding efficiency, core count, and the input size in one analytical expression. This allows the latter two to be adjusted according to given (realistic) efficiency objectives. Moreover, we not only find (i) the actual isoefficiency function but also (ii) the function one would yield if the program execution was free of resource contention and (iii) an upper bound that could only be reached if the program was able to maintain its average parallelism throughout its execution. The difference between the three helps to explain low efficiency, and in particular, it helps to differentiate between resource contention and structural conflicts related to task dependencies or scheduling. The insights gained can be used to co-design programs and shared system resources.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Utterback:2017:POR, author = "Robert Utterback and Kunal Agrawal and I-Ting Angelina Lee and Milind Kulkarni", title = "Processor-Oblivious Record and Replay", journal = j-SIGPLAN, volume = "52", number = "8", pages = "145--161", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018764", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Record-and-replay systems are useful tools for debugging non-deterministic parallel programs by first recording an execution and then replaying that execution to produce the same access pattern. 
Existing record-and-replay systems generally target thread-based execution models, and record the behaviors and interleavings of individual threads. Dynamic multithreaded languages and libraries, such as the Cilk family, OpenMP, TBB, etc., do not have a notion of threads. Instead, these languages provide a processor-oblivious model of programming, where programs expose task-parallelism using high-level constructs such as spawn/sync without regard to the number of threads/cores available to run the program. Thread-based record-and-replay would violate the processor-oblivious nature of these programs, as they incorporate the number of threads into the recorded information, constraining the replayed execution to the same number of threads. In this paper, we present a processor-oblivious record-and-replay scheme for such languages where record and replay can use different numbers of processors and both are scheduled using work stealing. We provide theoretical guarantees for our record and replay scheme --- namely that record is optimal for programs with one lock and replay is near-optimal for all cases. In addition, we implemented this scheme in the Cilk Plus runtime system and our evaluation indicates that processor-obliviousness does not cause substantial overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Prajapati:2017:SAA, author = "Nirmal Prajapati and Waruna Ranasinghe and Sanjay Rajopadhye and Rumen Andonov and Hristo Djidjev and Tobias Grosser", title = "Simple, Accurate, Analytical Time Modeling and Optimal Tile Size Selection for {GPGPU} Stencils", journal = j-SIGPLAN, volume = "52", number = "8", pages = "163--177", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018744", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Stencil computations are an important class of compute and data intensive programs that occur widely in scientific and engineering applications. A number of tools use sophisticated tiling, parallelization, and memory mapping strategies, and generate code that relies on vendor-supplied compilers. This code has a number of parameters, such as tile sizes, that are then tuned via empirical exploration. We develop a model that guides such a choice. Our model is a simple set of analytical functions that predict the execution time of the generated code. It is deliberately optimistic, since we are targeting modeling and parameter selections, such as tile sizes, that yield highly tuned codes. We experimentally validate the model on a number of 2D and 3D stencil codes, and show that the root mean square error in the execution time is less than 10\% for the subset of the codes that achieve performance within 20\% of the best.
Furthermore, based on using our model, we are able to predict tile sizes that achieve a further improvement of 9\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Jiang:2017:CSM, author = "Peng Jiang and Gagan Agrawal", title = "Combining {SIMD} and Many\slash Multi-core Parallelism for Finite State Machines with Enumerative Speculation", journal = j-SIGPLAN, volume = "52", number = "8", pages = "179--191", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018760", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "Finite State Machine (FSM) is the key kernel behind many popular applications, including regular expression matching, text tokenization, and Huffman decoding. Parallelizing FSMs is extremely difficult because of the strong dependencies and unpredictable memory accesses. Previous efforts have largely focused on multi-core parallelization, and used different approaches, including {\em speculative\/} and {\em enumerative\/} execution, both of which have been effective but also have limitations. With increasing width and improving flexibility in SIMD instruction sets, this paper focuses on combining SIMD and multi/many-core parallelism for FSMs. We have developed a novel strategy, called {\em enumerative speculation}. Instead of speculating on a single state as in speculative execution or enumerating all possible states as in enumerative execution, our strategy speculates transitions from several possible states, reducing the prediction overheads of speculation approach and the large amount of redundant work in the enumerative approach. A simple lookback approach produces a set of guessed states to achieve high speculation success rates in our enumerative speculation. We evaluate our method with four popular FSM applications: Huffman decoding, regular expression matching, HTML tokenization, and Div7. We obtain up to 2.5x speedup using SIMD on one core and up to 95x combining SIMD with 60 cores of an Intel Xeon Phi. On a single core, we outperform the best single-state speculative execution version by an average of 1.6x, and in combining SIMD and many-core parallelism, outperform enumerative execution by an average of 2x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Awan:2017:CCD, author = "Ammar Ahmad Awan and Khaled Hamidouche and Jahanzeb Maqbool Hashmi and Dhabaleswar K. 
Panda", title = "{S-Caffe}: Co-designing {MPI} Runtimes and {Caffe} for Scalable Deep Learning on Modern {GPU} Clusters", journal = j-SIGPLAN, volume = "52", number = "8", pages = "193--205", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018769", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Availability of large data sets like ImageNet and massively parallel computation support in modern HPC devices like NVIDIA GPUs have fueled a renewed interest in Deep Learning (DL) algorithms. This has triggered the development of DL frameworks like Caffe, Torch, TensorFlow, and CNTK. However, most DL frameworks have been limited to a single node. In order to scale out DL frameworks and bring HPC capabilities to the DL arena, we propose, S-Caffe; a scalable and distributed Caffe adaptation for modern multi-GPU clusters. With an in-depth analysis of new requirements brought forward by the DL frameworks and limitations of current communication runtimes, we present a co-design of the Caffe framework and the MVAPICH2-GDR MPI runtime. Using the co-design methodology, we modify Caffe's workflow to maximize the overlap of computation and communication with multi-stage data propagation and gradient aggregation schemes. We bring DL-Awareness to the MPI runtime by proposing a hierarchical reduction design that benefits from CUDA-Aware features and provides up to a massive 133x speedup over OpenMPI and 2.6x speedup over MVAPICH2 for 160 GPUs. S-Caffe successfully scales up to 160 K-80 GPUs for GoogLeNet (ImageNet) with a speedup of 2.5x over 32 GPUs. To the best of our knowledge, this is the first framework that scales up to 160 GPUs. Furthermore, even for single node training, S-Caffe shows an improvement of 14\% and 9\% over Nvidia's optimized Caffe for 8 and 16 GPUs, respectively. In addition, S-Caffe achieves up to 1395 samples per second for the AlexNet model, which is comparable to the performance of Microsoft CNTK.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Sabne:2017:MBI, author = "Amit Sabne and Xiao Wang and Sherman J. Kisner and Charles A. Bouman and Anand Raghunathan and Samuel P. Midkiff", title = "Model-based Iterative {CT} Image Reconstruction on {GPUs}", journal = j-SIGPLAN, volume = "52", number = "8", pages = "207--220", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018765", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Computed Tomography (CT) Image Reconstruction is an important technique used in a variety of domains, including medical imaging, electron microscopy, non-destructive testing and transportation security. Model-based Iterative Reconstruction (MBIR) using Iterative Coordinate Descent (ICD) is a CT algorithm that produces state-of-the-art results in terms of image quality. However, MBIR is highly computationally intensive and challenging to parallelize, and has traditionally been viewed as impractical in applications where reconstruction time is critical. 
We present the first GPU-based algorithm for ICD-based MBIR. The algorithm leverages the recently-proposed concept of SuperVoxels, and efficiently exploits the three levels of parallelism available in MBIR to better utilize the GPU hardware resources. We also explore data layout transformations to obtain more coalesced accesses and several GPU-specific optimizations for MBIR that boost performance. Across a suite of 3200 test cases, our GPU implementation obtains a geometric mean speedup of 4.43X over a state-of-the-art multi-core implementation on a 16-core iso-power CPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Yeh:2017:PFG, author = "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and Rudolf Eigenmann and Timothy G. Rogers", title = "{Pagoda}: Fine-Grained {GPU} Resource Virtualization for Narrow Tasks", journal = j-SIGPLAN, volume = "52", number = "8", pages = "221--234", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018754", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Massively multithreaded GPUs achieve high throughput by running thousands of threads in parallel. To fully utilize the hardware, workloads spawn work to the GPU in bulk by launching large tasks, where each task is a kernel that contains thousands of threads that occupy the entire GPU. GPUs face severe underutilization and their performance benefits vanish if the tasks are narrow, i.e., they contain {$<$} 500 threads. Latency-sensitive applications in network, signal, and image processing that generate a large number of tasks with relatively small inputs are examples of such limited parallelism. This paper presents Pagoda, a runtime system that virtualizes GPU resources, using an OS-like daemon kernel called MasterKernel. Tasks are spawned from the CPU onto Pagoda as they become available, and are scheduled by the MasterKernel at the warp granularity. Experimental results demonstrate that Pagoda achieves a geometric mean speedup of 5.70x over PThreads running on a 20-core CPU, 1.51x over CUDA-HyperQ, and 1.69x over GeMTC, the state-of-the-art runtime GPU task scheduling system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Ben-Nun:2017:GAM, author = "Tal Ben-Nun and Michael Sutton and Sreepathi Pai and Keshav Pingali", title = "{Groute}: an Asynchronous Multi-{GPU} Programming Model for Irregular Computations", journal = j-SIGPLAN, volume = "52", number = "8", pages = "235--248", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018756", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nodes with multiple GPUs are becoming the platform of choice for high-performance computing.
However, most applications are written using bulk-synchronous programming models, which may not be optimal for irregular algorithms that benefit from low-latency, asynchronous communication. This paper proposes constructs for asynchronous multi-GPU programming, and describes their implementation in a thin runtime environment called Groute. Groute also implements common collective operations and distributed work-lists, enabling the development of irregular applications without substantial programming effort. We demonstrate that this approach achieves state-of-the-art performance and exhibits strong scaling for a suite of irregular applications on 8-GPU and heterogeneous systems, yielding over 7x speedup for some algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Schardl:2017:TEF, author = "Tao B. Schardl and William S. Moses and Charles E. Leiserson", title = "{Tapir}: Embedding Fork-Join Parallelism into {LLVM}'s Intermediate Representation", journal = j-SIGPLAN, volume = "52", number = "8", pages = "249--265", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018758", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper explores how fork-join parallelism, as supported by concurrency platforms such as Cilk and OpenMP, can be embedded into a compiler's intermediate representation (IR). Mainstream compilers typically treat parallel linguistic constructs as syntactic sugar for function calls into a parallel runtime. These calls prevent the compiler from performing optimizations across parallel control constructs. Remedying this situation is generally thought to require an extensive reworking of compiler analyses and code transformations to handle parallel semantics. Tapir is a compiler IR that represents logically parallel tasks asymmetrically in the program's control flow graph. Tapir allows the compiler to optimize across parallel control constructs with only minor changes to its existing analyses and code transformations. To prototype Tapir in the LLVM compiler, for example, we added or modified about 6000 lines of LLVM's 4-million-line codebase. Tapir enables LLVM's existing compiler optimizations for serial code --- including loop-invariant-code motion, common-subexpression elimination, and tail-recursion elimination --- to work with parallel control constructs such as spawning and parallel loops. 
Tapir also supports parallel optimizations such as loop scheduling.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Matveev:2017:MPC, author = "Alexander Matveev and Yaron Meirovitch and Hayk Saribekyan and Wiktor Jakubiuk and Tim Kaler and Gergely Odor and David Budden and Aleksandar Zlateski and Nir Shavit", title = "A Multicore Path to Connectomics-on-Demand", journal = j-SIGPLAN, volume = "52", number = "8", pages = "267--281", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018766", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The current design trend in large scale machine learning is to use distributed clusters of CPUs and GPUs with MapReduce-style programming. Some have been led to believe that this type of horizontal scaling can reduce or even eliminate the need for traditional algorithm development, careful parallelization, and performance engineering. This paper is a case study showing the contrary: that the benefits of algorithms, parallelization, and performance engineering, can sometimes be so vast that it is possible to solve ``cluster-scale'' problems on a single commodity multicore machine. Connectomics is an emerging area of neurobiology that uses cutting edge machine learning and image processing to extract brain connectivity graphs from electron microscopy images. It has long been assumed that the processing of connectomics data will require mass storage, farms of CPU/GPUs, and will take months (if not years) of processing time. We present a high-throughput connectomics-on-demand system that runs on a multicore machine with less than 100 cores and extracts connectomes at the terabyte per hour pace of modern electron microscopes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Vollmer:2017:SHS, author = "Michael Vollmer and Ryan G. Scott and Madanlal Musuvathi and Ryan R. Newton", title = "{SC-Haskell}: Sequential Consistency in Languages That Minimize Mutable Shared Heap", journal = j-SIGPLAN, volume = "52", number = "8", pages = "283--298", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018746", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A core, but often neglected, aspect of a programming language design is its memory (consistency) model. Sequential consistency~(SC) is the most intuitive memory model for programmers as it guarantees sequential composition of instructions and provides a simple abstraction of shared memory as a single global store with atomic read and writes. Unfortunately, SC is widely considered to be impractical due to its associated performance overheads. Perhaps contrary to popular opinion, this paper demonstrates that SC is achievable with acceptable performance overheads for mainstream languages that minimize mutable shared heap. In particular, we modify the Glasgow Haskell Compiler to insert fences on all writes to shared mutable memory accessed in nonfunctional parts of the program. 
For a benchmark suite containing 1,279 programs, SC adds a geomean overhead of less than 0.4\% on an x86 machine. The efficiency of SC arises primarily due to the isolation provided by the Haskell type system between purely functional and thread-local imperative computations on the one hand, and imperative computations on the global heap on the other. We show how to use new programming idioms to further reduce the SC overhead; these create a virtuous cycle of less overhead and even stronger semantic guarantees (static data-race freedom).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Battig:2017:SDC, author = "Martin B{\"a}ttig and Thomas R. Gross", title = "Synchronized-by-Default Concurrency for Shared-Memory Systems", journal = j-SIGPLAN, volume = "52", number = "8", pages = "299--312", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018747", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We explore a programming approach for concurrency that synchronizes all accesses to shared memory by default. Synchronization takes place by ensuring that all program code runs inside atomic sections even if the program code has external side effects. Threads are mapped to atomic sections that a programmer must explicitly split to increase concurrency. A naive implementation of this approach incurs a large amount of overhead. We show how to reduce this overhead to make the approach suitable for realistic application programs on existing hardware. We present an implementation technique based on a special-purpose software transactional memory system. To reduce the overhead, the technique exploits properties of managed, object-oriented programming languages as well as intraprocedural static analyses and uses field-level granularity locking in combination with transactional I/O to provide good scaling properties. We implemented the synchronized-by-default (SBD) approach for the Java language and evaluate its performance for six programs from the DaCapo benchmark suite. The evaluation shows that, compared to explicit synchronization, the SBD approach has an overhead between 0.4\% and 102\% depending on the benchmark and the number of threads, with a mean (geom.) of 23.9\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Moreira:2017:FCR, author = "Rubens E. A. Moreira and Sylvain Collange and Fernando Magno Quint{\~a}o Pereira", title = "Function Call Re-Vectorization", journal = j-SIGPLAN, volume = "52", number = "8", pages = "313--326", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018751", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "Programming languages such as C for CUDA, OpenCL or ISPC have contributed to increase the programmability of SIMD accelerators and graphics processing units. 
However, these languages still lack the flexibility offered by low-level SIMD programming on explicit vectors. To close this expressiveness gap while preserving performance, this paper introduces the notion of Function Call Re-Vectorization (CREV). CREV allows changing the dimension of vectorization during the execution of a kernel, exposing it as a nested parallel kernel call. CREV affords programmability close to dynamic parallelism, a feature that allows the invocation of kernels from inside kernels, but at much lower cost. In this paper, we present a formal semantics of CREV, and an implementation of it on the ISPC compiler. We have used CREV to implement some classic algorithms, including string matching, depth first search and Bellman-Ford, with minimum effort. These algorithms, once compiled by ISPC to Intel-based vector instructions, are as fast as state-of-the-art implementations, yet much simpler. Thus, CREV gives developers the elegance of dynamic programming, and the performance of explicit SIMD programming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Rajbhandari:2017:OFI, author = "Samyam Rajbhandari and Fabrice Rastello and Karol Kowalski and Sriram Krishnamoorthy and P. Sadayappan", title = "Optimizing the Four-Index Integral Transform Using Data Movement Lower Bounds Analysis", journal = j-SIGPLAN, volume = "52", number = "8", pages = "327--340", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018771", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The four-index integral transform is a fundamental and computationally demanding calculation used in many computational chemistry suites such as NWChem. It transforms a four-dimensional tensor from one basis to another. This transformation is most efficiently implemented as a sequence of four tensor contractions that each contract a four-dimensional tensor with a two-dimensional transformation matrix. Differing degrees of permutation symmetry in the intermediate and final tensors in the sequence of contractions cause intermediate tensors to be much larger than the final tensor and limit the number of electronic states in the modeled systems. Loop fusion, in conjunction with tiling, can be very effective in reducing the total space requirement, as well as data movement. However, the large number of possible choices for loop fusion and tiling, and data/computation distribution across a parallel system, make it challenging to develop an optimized parallel implementation for the four-index integral transform. We develop a novel approach to address this problem, using lower bounds modeling of data movement complexity. We establish relationships between available aggregate physical memory in a parallel computer system and ineffective fusion configurations, enabling their pruning and consequent identification of effective choices and a characterization of optimality criteria.
This work has resulted in the development of a significantly improved implementation of the four-index transform that enables higher performance and the ability to model larger electronic systems than the current implementation in the NWChem quantum chemistry software suite.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Steele:2017:UBP, author = "Guy L. {Steele, Jr.} and Jean-Baptiste Tristan", title = "Using Butterfly-Patterned Partial Sums to Draw from Discrete Distributions", journal = j-SIGPLAN, volume = "52", number = "8", pages = "341--355", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018757", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe a SIMD technique for drawing values from multiple discrete distributions, such as sampling from the random variables of a mixture model, that avoids computing a complete table of partial sums of the relative probabilities. A table of alternate (``butterfly-patterned'') form is faster to compute, making better use of coalesced memory accesses; from this table, complete partial sums are computed on the fly during a binary search. Measurements using CUDA 7.5 on an NVIDIA Titan Black GPU show that this technique makes an entire machine-learning application that uses a Latent Dirichlet Allocation topic model with 1024 topics about 13\% faster (when using single-precision floating-point data) or about 35\% faster (when using double-precision floating-point data) than doing a straightforward matrix transposition after using coalesced accesses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Basin:2017:KKV, author = "Dmitry Basin and Edward Bortnikov and Anastasia Braginsky and Guy Golan-Gueta and Eshcar Hillel and Idit Keidar and Moshe Sulamy", title = "{KiWi}: a Key--Value Map for Scalable Real-Time Analytics", journal = j-SIGPLAN, volume = "52", number = "8", pages = "357--369", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018761", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern big data processing platforms employ huge in-memory key--value (KV) maps. Their applications simultaneously drive high-rate data ingestion and large-scale analytics. These two scenarios expect KV-map implementations that scale well with both real-time updates and large atomic scans triggered by range queries. We present KiWi, the first atomic KV-map to efficiently support simultaneous large scans and real-time access. The key to achieving this is treating scans as first class citizens, and organizing the data structure around them. KiWi provides wait-free scans, whereas its put operations are lightweight and lock-free. It optimizes memory management jointly with data structure access. We implement KiWi and compare it to state-of-the-art solutions.
Compared to other KV-maps providing atomic scans, KiWi performs either long scans or concurrent puts an order of magnitude faster. Its scans are twice as fast as non-atomic ones implemented via iterators in the Java skiplist.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Jiang:2017:GAP, author = "Lin Jiang and Zhijia Zhao", title = "Grammar-aware Parallelization for Scalable {XPath} Querying", journal = j-SIGPLAN, volume = "52", number = "8", pages = "371--383", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018772", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Semi-structured data emerge in many domains, especially in web analytics and business intelligence. However, querying such data is inherently sequential due to the nested structure of input data. Existing solutions pessimistically enumerate all execution paths to circumvent dependencies, yielding sub-optimal performance and limited scalability. This paper presents GAP, a parallelization scheme that, for the first time, leverages the grammar of the input data to boost the parallelization efficiency. GAP leverages static analysis to infer feasible execution paths for specific contexts based on the grammar of the semi-structured data. It can eliminate unnecessary paths without compromising the correctness. In the absence of a pre-defined grammar, GAP switches into a speculative execution mode and takes potentially incomplete grammar extracted either from prior inputs. Together, the dual-mode GAP reduces the execution paths from all paths to a minimum, therefore maximizing the parallelization efficiency and scalability. The benefits of path elimination go beyond reducing extra computation --- it also enables the use of more efficient data structures, which further improves the efficiency. An evaluation on a large set of standard benchmarks with diverse queries shows that GAP yields significant efficiency increase and boosts the speedup of the state-of-the-art from 2.9X to 17.6X on a 20-core machine for a set of 200 queries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Wang:2017:ESC, author = "Xin Wang and Weihua Zhang and Zhaoguo Wang and Ziyun Wei and Haibo Chen and Wenyun Zhao", title = "{Eunomia}: Scaling Concurrent Search Trees under Contention Using {HTM}", journal = j-SIGPLAN, volume = "52", number = "8", pages = "385--399", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018752", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "While hardware transactional memory (HTM) has recently been adopted to construct efficient concurrent search tree structures, such designs fail to deliver scalable performance under contention. In this paper, we first conduct a detailed analysis on an HTM-based concurrent B+Tree, which uncovers several reasons for excessive HTM aborts induced by both false and true conflicts under contention. 
Based on the analysis, we advocate Eunomia, a design pattern for search trees which contains several principles to reduce HTM aborts, including splitting HTM regions with version-based concurrency control to reduce HTM working sets, partitioned data layout to reduce false conflicts, proactively detecting and avoiding true conflicts, and adaptive concurrency control. To validate their effectiveness, we apply such designs to construct a scalable concurrent B+Tree using HTM. Evaluation using key--value store benchmarks on a 20-core HTM-capable multi-core machine shows that Eunomia leads to 5x--11x speedup under high contention, while incurring small overhead under low contention.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Tang:2017:SCM, author = "Xiongchao Tang and Jidong Zhai and Bowen Yu and Wenguang Chen and Weimin Zheng", title = "Self-Checkpoint: an In-Memory Checkpoint Method Using Less Space and Its Practice on Fault-Tolerant {HPL}", journal = j-SIGPLAN, volume = "52", number = "8", pages = "401--413", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018745", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Fault tolerance is increasingly important in high performance computing due to the substantial growth of system scale and decreasing system reliability. In-memory/diskless checkpoint has gained extensive attention as a solution to avoid the IO bottleneck of traditional disk-based checkpoint methods. However, applications using previous in-memory checkpoint suffer from little available memory space. To provide high reliability, previous in-memory checkpoint methods either need to keep two copies of checkpoints to tolerate failures while updating old checkpoints or trade performance for space by flushing in-memory checkpoints into disk. In this paper, we propose a novel in-memory checkpoint method, called self-checkpoint, which can not only achieve the same reliability of previous in-memory checkpoint methods, but also increase the available memory space for applications by almost 50\%. To validate our method, we apply the self-checkpoint to an important problem, fault tolerant HPL. We implement a scalable and fault tolerant HPL based on this new method, called SKT-HPL, and validate it on two large-scale systems. Experimental results with 24,576 processes show that SKT-HPL achieves over 95\% of the performance of the original HPL. 
Compared to the state-of-the-art in-memory checkpoint method, it improves the available memory size by 47\% and the performance by 5\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Wu:2017:SDC, author = "Panruo Wu and Nathan DeBardeleben and Qiang Guan and Sean Blanchard and Jieyang Chen and Dingwen Tao and Xin Liang and Kaiming Ouyang and Zizhong Chen", title = "Silent Data Corruption Resilient Two-sided Matrix Factorizations", journal = j-SIGPLAN, volume = "52", number = "8", pages = "415--427", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018750", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents an algorithm based fault tolerance method to harden three two-sided matrix factorizations against soft errors: reduction to Hessenberg form, tridiagonal form, and bidiagonal form. These two sided factorizations are usually the prerequisites to computing eigenvalues/eigenvectors and singular value decomposition. Algorithm based fault tolerance has been shown to work on three main one-sided matrix factorizations: LU, Cholesky, and QR, but extending it to cover two sided factorizations is non-trivial because there are no obvious {\it offline, problem} specific maintenance of checksums. We thus develop an {\it online, algorithm} specific checksum scheme and show how to systematically adapt the two sided factorization algorithms used in LAPACK and ScaLAPACK packages to introduce the algorithm based fault tolerance. The resulting ABFT scheme can detect and correct arithmetic errors {\it continuously} during the factorizations that allow timely error handling. Detailed analysis and experiments are conducted to show the cost and the gain in resilience. We demonstrate that our scheme covers a significant portion of the operations of the factorizations. Our checksum scheme achieves high error detection coverage and error correction coverage compared to the state of the art, with low overhead and high scalability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Arbel-Raviv:2017:PRD, author = "Maya Arbel-Raviv and Trevor Brown", title = "{Poster}: Reuse, don't Recycle: Transforming Algorithms that Throw Away Descriptors", journal = j-SIGPLAN, volume = "52", number = "8", pages = "429--430", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019035", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Lock-free algorithms guarantee progress by having threads help one another. Complex lock-free operations facilitate helping by creating descriptor objects that describe how other threads should help them. In many lock-free algorithms, a new descriptor is allocated for each operation. After an operation completes, its descriptor must be reclaimed by a memory reclamation scheme. Allocating and reclaiming descriptors introduces significant space and time overhead. 
We present a transformation for a class of lock-free algorithms that allows each thread to efficiently reuse a single descriptor. Experiments on a variety of workloads show that our transformation yields significant improvements over implementations that reclaim descriptors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Balaji:2017:PAP, author = "Vignesh Balaji and Dhruva Tirumala and Brandon Lucia", title = "{Poster}: an Architecture and Programming Model for Accelerating Parallel Commutative Computations via Privatization", journal = j-SIGPLAN, volume = "52", number = "8", pages = "431--432", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019030", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Synchronization and data movement are the key impediments to an efficient parallel execution. To ensure that data shared by multiple threads remain consistent, the programmer must use synchronization (e.g., mutex locks) to serialize threads' accesses to data. This limits parallelism because it forces threads to sequentially access shared resources. Additionally, systems use cache coherence to ensure that processors always operate on the most up-to-date version of a value even in the presence of private caches. Coherence protocol implementations cause processors to serialize their accesses to shared data, further limiting parallelism and performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Bhattacharyya:2017:PHE, author = "Arnamoy Bhattacharyya and Mike Dai Wang and Mihai Burcea and Yi Ding and Allen Deng and Sai Varikooty and Shafaaf Hossain and Cristiana Amza", title = "{Poster}: {HythTM}: Extending the Applicability of {Intel TSX} Hardware Transactional Support", journal = j-SIGPLAN, volume = "52", number = "8", pages = "433--434", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019027", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this work, we introduce and experimentally evaluate a new hybrid software-hardware Transactional Memory prototype based on Intel's Haswell TSX architecture. Our prototype extends the applicability of the existing hardware support for TM by interposing a hybrid fall-back layer before the sequential, big-lock fall-back path, used by standard TSX-supported solutions in order to guarantee progress. In our experimental evaluation we use SynQuake, a realistic game benchmark modeled after Quake. Our results show that our hybrid transactional system,which we call HythTM, is able to reduce the number of transactions that go to the sequential software layer, hence avoiding hardware transaction aborts and loss of parallelism. 
HythTM optimizes application throughput and scalability up to 5.05x, when compared to the hardware TM with sequential fall-back path.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Chowdhury:2017:PPE, author = "Rezaul Chowdhury and Pramod Ganapathi and Yuan Tang and Jesmin Jahan Tithi", title = "{Poster}: Provably Efficient Scheduling of Cache-Oblivious Wavefront Algorithms", journal = j-SIGPLAN, volume = "52", number = "8", pages = "435--436", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019031", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Standard cache-oblivious recursive divide-and-conquer algorithms for evaluating dynamic programming recurrences have optimal serial cache complexity but often have lower parallelism compared with iterative wavefront algorithms due to artificial dependencies among subtasks. Very recently cache-oblivious recursive wavefront (COW) algorithms have been introduced which do not have any artificial dependencies. Though COW algorithms are based on fork-join primitives, they extensively use atomic operations, and as a result, performance guarantees provided by state-of-the-art schedulers for programs with fork-join primitives do not apply. In this work, we show how to systematically transform standard cache-oblivious recursive divide-and-conquer algorithms into recursive wavefront algorithms to achieve optimal parallel cache complexity and high parallelism under state-of-the-art schedulers for fork-join programs. Unlike COW algorithms these new algorithms do not use atomic operations. Instead, they use closed-form formulas to compute at what time each recursive function must be launched in order to achieve high parallelism without losing cache performance. The resulting implementations are arguably much simpler than implementations of known COW algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Cohen:2017:PST, author = "Nachshon Cohen and Maurice Herlihy and Erez Petrank and Elias Wald", title = "{Poster}: State Teleportation via Hardware Transactional Memory", journal = j-SIGPLAN, volume = "52", number = "8", pages = "437--438", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019026", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "State teleportation is a new technique for exploiting hardware transactional memory (HTM) to improve existing synchronization and memory management schemes for highly-concurrent data structures. When applied to fine-grained locking, a thread holding the lock for a node launches a hardware transaction that traverses multiple successor nodes, acquires the lock for the last node reached, and releases the lock on the starting node, skipping lock acquisitions for intermediate nodes. 
When applied to lock-free data structures, a thread visiting a node protected by a hazard pointer launches a hardware transaction that traverses multiple successor nodes, and publishes the hazard pointer only for the last node reached, skipping the memory barriers needed to publish intermediate hazard pointers. Experimental results show that these applications of state teleportation can substantially increase the performance of both lock-based and lock-free data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Dai:2017:PII, author = "Dong Dai and Wei Zhang and Yong Chen", title = "{Poster}: {IOGP}: an Incremental Online Graph Partitioning for Large-Scale Distributed Graph Databases", journal = j-SIGPLAN, volume = "52", number = "8", pages = "439--440", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019037", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Large-scale graphs are becoming critical in various domains such as social networks, scientific applications, knowledge discovery, and even system software. Many of those use cases require large-scale high-performance graph databases, which are designed for serving continuous updates from the clients, and at the same time, answering complex queries towards the current graph in an on-line manner. Those operations in graph databases, also referred to as OLTP (online transaction processing) operations, need specific design and implementation in graph partitioning algorithms. In this study, we designed an incremental online graph partitioning (IOGP), optimized for OLTP workloads. It is designed to achieve better locality, generate balanced partitions, and increase the parallelism for accessing hotspots of the graph. Our evaluation results on both real-world and synthetic graphs in both simulation and a real system confirm better performance on graph queries (as much as 2X) with small overheads during graph insertion (less than 10\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Firoz:2017:PDC, author = "Jesun Shariar Firoz and Thejaka Amila Kanewala and Marcin Zalewski and Martina Barnas and Andrew Lumsdaine", title = "{Poster}: Distributed Control: The Benefits of Eliminating Global Synchronization via Effective Scheduling", journal = j-SIGPLAN, volume = "52", number = "8", pages = "441--442", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019036", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In distributed computing, parallel overheads such as \emph{synchronization overhead} may hinder performance. We introduce the idea of \emph{Distributed Control} (DC) where global synchronization is reduced to \emph{termination detection} and each worker proceeds ahead optimistically, based on the local knowledge of the global computation. To avoid ``wasted'' work, DC relies on local work prioritization.
However, the work order obtained by local prioritization is susceptible to interference from the runtime. We show that employing effective scheduling policies and optimizations in the runtime, in conjunction with eliminating global barriers, improves performance in two graph applications: single-source shortest paths and connected components.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Jo:2017:PMA, author = "Gangwon Jo and Jaehoon Jung and Jiyoung Park and Jaejin Lee", title = "{Poster}: {MAPA}: an Automatic Memory Access Pattern Analyzer for {GPU} Applications", journal = j-SIGPLAN, volume = "52", number = "8", pages = "443--444", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019034", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Various existing optimization and memory consistency management techniques for GPU applications rely on memory access patterns of kernels. However, they suffer from poor practicality because they require explicit user interventions to extract kernel memory access patterns. This paper proposes an automatic memory-access-pattern analysis framework called MAPA. MAPA is based on a source-level analysis technique derived from traditional symbolic analyses and a run-time pattern selection technique. The experimental results show that MAPA properly analyzes 116 real-world OpenCL kernels from Rodinia and Parboil.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Li:2017:PCO, author = "Shigang Li and Yunquan Zhang and Torsten Hoefler", title = "{Poster}: Cache-Oblivious {MPI} All-to-All Communications on Many-Core Architectures", journal = j-SIGPLAN, volume = "52", number = "8", pages = "445--446", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019025", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the many-core era, the performance of MPI collectives is more dependent on the intra-node communication component. However, the communication algorithms generally inherit from the inter-node version and ignore the cache complexity. We propose cache-oblivious algorithms for MPI all-to-all operations, in which data blocks are copied into the receive buffers in Morton order to exploit data locality. Experimental results on different many-core architectures show that our cache-oblivious implementations significantly outperform the naive implementations based on shared heap and the highly optimized MPI libraries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Menon:2017:PAL, author = "Harshitha Menon and Kavitha Chandrasekar and Laxmikant V. 
Kale", title = "{Poster}: Automated Load Balancer Selection Based on Application Characteristics", journal = j-SIGPLAN, volume = "52", number = "8", pages = "447--448", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019033", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many HPC applications require dynamic load balancing to achieve high performance and system utilization. Different applications have different characteristics and hence require different load balancing strategies. Invocation of a suboptimal load balancing strategy can lead to inefficient execution. We propose Meta-Balancer, a framework to automatically decide the best load balancing strategy. It employs randomized decision forests, a machine learning method, to learn a model for choosing the best load balancing strategy for an application represented by a set of features that capture the application characteristics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Moscovici:2017:PGF, author = "Nurit Moscovici and Nachshon Cohen and Erez Petrank", title = "{Poster}: a {GPU}-Friendly Skiplist Algorithm", journal = j-SIGPLAN, volume = "52", number = "8", pages = "449--450", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019032", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a design for a fine-grained lock-based skiplist optimized for Graphics Processing Units (GPUs). While GPUs are often used to accelerate streaming parallel computations, it remains a significant challenge to efficiently offload concurrent computations with more complicated data-irregular access and fine-grained synchronization. Natural building blocks for such computations would be concurrent data structures, such as skiplists, which are widely used in general purpose computations. Our design utilizes array-based nodes which are accessed and updated by warp-cooperative functions, thus taking advantage of the fact that GPUs are most efficient when memory accesses are coalesced and execution divergence is minimized. 
The proposed design has been implemented, and measurements demonstrate improved performance of up to 2.6x over existing skiplist designs for the GPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Ramalhete:2017:PPM, author = "Pedro Ramalhete and Andreia Correia", title = "{Poster}: Poor Man's {URCU}", journal = j-SIGPLAN, volume = "52", number = "8", pages = "451--452", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019021", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "RCU is, among other things, a well known mechanism for memory reclamation that is meant to be used in languages without an automatic Garbage Collector. Unfortunately, it requires operating system support, which is currently provided only in Linux. An alternative is to use Userspace RCU (URCU), which has two variants that can be deployed on other operating systems, named \emph{Memory Barrier} and \emph{Bullet Proof}. We present a novel algorithm that implements the three core APIs of RCU: \texttt{rcu\_read\_lock()}, \texttt{rcu\_read\_unlock()}, and \texttt{synchronize\_rcu()}. Our algorithm uses one mutual exclusion lock and two reader-writer locks with \texttt{trylock()} capabilities, which means it does not need a language with a memory model or atomics API, and as such, it can be easily implemented in almost any language, regardless of the underlying CPU architecture or operating system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Ramalhete:2017:PWF, author = "Pedro Ramalhete and Andreia Correia", title = "{Poster}: a Wait-Free Queue with Wait-Free Memory Reclamation", journal = j-SIGPLAN, volume = "52", number = "8", pages = "453--454", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019022", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Queues are a widely deployed data structure. They are used extensively in many multithreaded applications, or as a communication mechanism between threads or processes. We propose a new linearizable multi-producer-multi-consumer queue we named Turn queue, with wait-free progress bounded by the number of threads, and with wait-free bounded memory reclamation.
Its main characteristics are: a simple algorithm that does no memory allocation apart from creating the node that is placed in the queue, a new wait-free consensus algorithm using only the atomic instruction compare-and-swap (CAS), and easy integration with other algorithms for either the enqueue or dequeue method.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Tang:2017:PSS, author = "Yuan Tang and Ronghui You", title = "{Poster}: {STAR} (Space-Time Adaptive and Reductive) Algorithms for Real-World Space-Time Optimality", journal = j-SIGPLAN, volume = "52", number = "8", pages = "455--456", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019029", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is important to strike a space-time balance for a real-world algorithm to achieve high performance on modern shared-memory multi-core or many-core systems. However, a large class of dynamic programs with more than $ O(1) $ dependency achieve optimality either in space or time, but not both. In the literature, the problem is known as the fundamental space-time tradeoff. By properly exploiting the runtime system, we show that our STAR (Space-Time Adaptive and Reductive) technique can help these dynamic programs to achieve sublinear parallel time bounds while still maintaining work-, space-, and cache-optimality in a processor- and cache-oblivious fashion.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Wu:2017:PRP, author = "Mingyu Wu and Haibing Guan and Binyu Zang and Haibo Chen", title = "{Poster}: Recovering Performance for Vector-based Machine Learning on Managed Runtime", journal = j-SIGPLAN, volume = "52", number = "8", pages = "457--458", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019039", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Zhang:2017:PPC, author = "Minjia Zhang and Swarnendu Biswas and Michael D. Bond", title = "{Poster}: On the Problem of Consistency Exceptions in the Context of Strong Memory Models", journal = j-SIGPLAN, volume = "52", number = "8", pages = "459--460", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019024", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This work considers the problem of availability for memory models that throw consistency exceptions. We define a new memory model called RIx based on isolation of synchronization-free regions and a new approach called Avalon that provides RIx. Our evaluation shows that Avalon and RIx substantially reduce consistency exceptions by 1--3 orders of magnitude, and sometimes eliminate them completely.
Furthermore, our exploration provides new, compelling points in the performance-availability tradeoff space.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Zhao:2017:PIH, author = "Yue Zhao and Chunhua Liao and Xipeng Shen", title = "{Poster}: an Infrastructure for {HPC} Knowledge Sharing and Reuse", journal = j-SIGPLAN, volume = "52", number = "8", pages = "461--462", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3019023", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a prototype infrastructure for addressing the barriers for effective accumulation, sharing, and reuse of the various types of knowledge for high performance parallel computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '17 conference proceedings.", } @Article{Shen:2017:BGB, author = "Xipeng Shen", title = "Bridging the gap between memory performance and massive parallelism: the critical role of programming systems innovations (keynote)", journal = j-SIGPLAN, volume = "52", number = "9", pages = "1--1", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092569", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This talk examines some trends in the modern developments of memory systems and their relations with the massive parallelism in processors and applications. It then draws on some recent work on GPU to explain the important role of programming systems in bridging the gap; it particularly emphasizes the importance of innovations for enabling better software controllability, more software elasticity, and inter-thread data locality enhancements. The talk further discusses the implications brought to programming systems by the increasingly blurred boundaries among memory, storage, and processing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Bruno:2017:NPG, author = "Rodrigo Bruno and Lu{\'\i}s Picciochi Oliveira and Paulo Ferreira", title = "{NG2C}: pretenuring garbage collection with dynamic generations for {HotSpot} big data applications", journal = j-SIGPLAN, volume = "52", number = "9", pages = "2--13", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092272", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Big Data applications suffer from unpredictable and unacceptably high pause times due to Garbage Collection (GC). This is the case in latency-sensitive applications such as on-line credit-card fraud detection, graph-based computing for analysis on social networks, etc. 
Such pauses compromise latency requirements of the whole application stack and result from applications' aggressive buffering/caching of data, exposing an ill-suited GC design, which assumes that most objects will die young and does not consider that applications hold large amounts of middle-lived data in memory. To avoid such pauses, we propose NG2C, a new GC algorithm that combines pretenuring with user-defined dynamic generations. By being able to allocate objects into different generations, NG2C is able to group objects with similar lifetime profiles in the same generation. By allocating objects with similar lifetime profiles close to each other, i.e. in the same generation, we avoid object promotion (copying between generations) and heap fragmentation (which leads to heap compactions), both responsible for most of the duration of HotSpot GC pause times. NG2C is implemented for the OpenJDK 8 HotSpot Java Virtual Machine, as an extension of the Garbage First GC. We evaluate NG2C using Cassandra, Lucene, and GraphChi with three different GCs: Garbage First (G1), Concurrent Mark Sweep (CMS), and NG2C. Results show that NG2C decreases the worst observable GC pause time by up to 94.8\% for Cassandra, 85.0\% for Lucene and 96.45\% for GraphChi, when compared to current collectors (G1 and CMS). In addition, NG2C has no negative impact on application throughput or memory usage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Yang:2017:TAA, author = "Albert Mingkun Yang and Tobias Wrigstad", title = "Type-assisted automatic garbage collection for lock-free data structures", journal = j-SIGPLAN, volume = "52", number = "9", pages = "14--24", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092274", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce Isolde, an automatic garbage collection scheme designed specifically for managing memory in lock-free data structures, such as stacks, lists, maps and queues. Isolde exists as a plug-in memory manager, designed to sit on top of another memory manager and use its allocator and reclaimer (if one exists). Isolde treats a lock-free data structure as a logical heap, isolated from the rest of the program. This allows garbage collection outside of Isolde to take place without affecting the lock-free data structure.
Isolde further manages objects allocated on a Isolde heap in a fully concurrent manner, allowing garbage collection to incrementally remove garbage without stopping other threads doing work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Vrvilo:2017:MDF, author = "Nick Vrvilo and Lechen Yu and Vivek Sarkar", title = "A marshalled data format for pointers in relocatable data blocks", journal = j-SIGPLAN, volume = "52", number = "9", pages = "25--35", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092276", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As future computing hardware progresses towards extreme-scale technology, new challenges arise for addressing heterogeneous compute and memory resources, for providing application resilience in the presence of more frequent failures, and for working within strict energy constraints. While C++ has gained popularity in recent years within the HPC community, some concepts of object-oriented program design may be at odds with the techniques we use to address the challenges of extreme-scale computing. In this work, we focus on the challenges related to using aggregate data structures that include pointer values within a programming model where the runtime may frequently relocate data, and traditional serialization techniques are not practical. We propose and evaluate a marshalled encoding for relocatable data blocks, and present a C++ library and other tools to simplify the work of the application programmer developing new applications or porting existing applications to such emerging programming models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Liu:2017:FEM, author = "Zhengyang Liu and John Criswell", title = "Flexible and efficient memory object metadata", journal = j-SIGPLAN, volume = "52", number = "9", pages = "36--46", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092268", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Compiler-based tools can protect software from attack and find bugs within programs. To support programs written in type-unsafe languages such as C, such tools need to add code into a program that must, at run-time, take a pointer into a memory object and locate metadata for that memory object. Current methods of locating metadata are either flexible (supporting metadata of varying sizes) at the expense of speed and scalability or are fast (e.g., by using shadow tables) at the cost of flexibility (metadata is small and must always be the same size). This paper presents a new method of attaching metadata to memory objects, named Padding Area MetaData (PAMD), that is both flexible and efficient. Metadata can be any size, and different memory objects can have different sized metadata. While flexible, the algorithm for finding the metadata given a pointer into the memory object takes constant time. 
Our method extends Baggy Bounds with Accurate Checking (BBAC) which attaches constant-sized metadata to memory objects for performing precise dynamic bounds checks. Our design supports variable-sized metadata, and our implementation supports larger programs. We evaluated the performance and scalability of PAMD using dynamic bounds checking as an exemplar of our method. Our results show that our method adds at most 33\% overhead to an identical dynamic bounds checking tool that trades precision for performance by using a simple shadow table. Our results also show that our method, while having the same flexibility as splay trees, performs significantly faster and scales better as a program allocates more memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Vorobyov:2017:SSE, author = "Kostyantyn Vorobyov and Julien Signoles and Nikolai Kosmatov", title = "Shadow state encoding for efficient monitoring of block-level properties", journal = j-SIGPLAN, volume = "52", number = "9", pages = "47--58", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092269", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Memory shadowing associates addresses from an application's memory to values stored in a disjoint memory space called shadow memory. At runtime shadow values store metadata about application memory locations they are mapped to. Shadow state encodings --- the structure of shadow values and their interpretation --- vary across different tools. Encodings used by the state-of-the-art monitoring tools have been proven useful for tracking memory at a byte-level, but cannot address properties related to memory block boundaries. Tracking block boundaries is however crucial for spatial memory safety analysis, where a spatial violation such as out-of-bounds access, may dereference an allocated location belonging to an adjacent block or a different struct member. This paper describes two novel shadow state encodings which capture block-boundary-related properties. These encodings have been implemented in E-ACSL --- a runtime verification tool for C programs. Initial experiments involving checking validity of pointer and array accesses in computationally intensive runs of programs selected from SPEC CPU benchmarks demonstrate runtime and memory overheads comparable to state-of-the-art memory debuggers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Dashti:2017:AMM, author = "Mohammad Dashti and Alexandra Fedorova", title = "Analyzing memory management methods on integrated {CPU--GPU} systems", journal = j-SIGPLAN, volume = "52", number = "9", pages = "59--69", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092256", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Heterogeneous systems that integrate a multicore CPU and a GPU on the same die are ubiquitous. 
On these systems, both the CPU and GPU share the same physical memory as opposed to using separate memory dies. Although integration eliminates the need to copy data between the CPU and the GPU, arranging transparent memory sharing between the two devices can carry large overheads. Memory on CPU/GPU systems is typically managed by a software framework such as OpenCL or CUDA, which includes a runtime library, and communicates with a GPU driver. These frameworks offer a range of memory management methods that vary in ease of use, consistency guarantees and performance. In this study, we analyze some of the common memory management methods of the most widely used software frameworks for heterogeneous systems: CUDA, OpenCL 1.2, OpenCL 2.0, and HSA, on NVIDIA and AMD hardware. We focus on performance/functionality trade-offs, with the goal of exposing their performance impact and simplifying the choice of memory management methods for programmers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Giles:2017:CCH, author = "Ellis Giles and Kshitij Doshi and Peter Varman", title = "Continuous checkpointing of {HTM} transactions in {NVM}", journal = j-SIGPLAN, volume = "52", number = "9", pages = "70--81", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092270", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper addresses the challenges of coupling byte-addressable non-volatile memory (NVM) and hardware transactional memory (HTM) in high-performance transaction processing. We first show that HTM transactions can be ordered using existing processor instructions without any hardware changes. In contrast, existing solutions posit changes to HTM mechanisms in the form of special instructions or modified functionality. We exploit the ordering mechanism to design a novel persistence method that decouples HTM concurrency from back-end NVM operations. Failure atomicity is achieved using redo logging coupled with aliasing to guard against mistimed cache evictions. Our algorithm uses efficient lock-free mechanisms with bounded static memory requirements. We evaluated our approach using both micro-benchmarks and benchmarks in the STAMP suite, and showed that it compares well with standard (volatile) HTM transactions.
We also showed that it yields significant gains in throughput and latency in comparison with persistent transactional locking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Peng:2017:RTD, author = "Ivy Bo Peng and Roberto Gioiosa and Gokcen Kestor and Pietro Cicotti and Erwin Laure and Stefano Markidis", title = "{RTHMS}: a tool for data placement on hybrid memory system", journal = j-SIGPLAN, volume = "52", number = "9", pages = "82--91", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092273", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Traditional scientific and emerging data analytics applications require fast, power-efficient, large, and persistent memories. Combining all these characteristics within a single memory technology is expensive and hence future supercomputers will feature different memory technologies side-by-side. However, it is a complex task to program hybrid-memory systems and to identify the best object-to-memory mapping. We envision that programmers will probably resort to use default configurations that only require minimal interventions on the application code or system settings. In this work, we argue that intelligent, fine-grained data placement can achieve higher performance than default setups. We present an algorithm for data placement on hybrid-memory systems. Our algorithm is based on a set of single-object allocation rules and global data placement decisions. We also present RTHMS, a tool that implements our algorithm and provides recommendations about the object-to-memory mapping. Our experiments on a hybrid memory system, an Intel Knights Landing processor with DRAM and HBM, show that RTHMS is able to achieve higher performance than the default configuration. We believe that RTHMS will be a valuable tool for programmers working on complex hybrid-memory systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Kanvar:2017:WNG, author = "Vini Kanvar and Uday P. Khedker", title = "``{What}'s in a name?'' going beyond allocation site names in heap analysis", journal = j-SIGPLAN, volume = "52", number = "9", pages = "92--103", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092267", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A points-to analysis computes a sound abstraction of heap memory conventionally using a name-based abstraction that summarizes runtime memory by grouping locations using the names of allocation sites: All concrete heap locations allocated by the same statement are grouped together. The locations in the same group are treated alike i.e., a pointer to any one location of the group is assumed to point to every location in the group leading to an over-approximation of points-to relations. 
We propose an access-based abstraction that partitions each name-based group of locations into equivalence classes at every program point using an additional criterion of the sets of access paths (chains of pointer indirections) reaching the locations in the memory. The intuition is that the locations that are both allocated and accessed alike should be grouped into the same equivalence class. Since the access paths in the memory could reach different locations at different program points, our groupings change flow sensitively unlike the name-based groupings. This creates a more precise view of the memory. Theoretically, it is strictly more precise than the name-based abstraction except in some trivial cases; practically it is far more precise. Our empirical measurements show the benefits of our tool Access-Based Heap Analyzer (ABHA) on SPEC CPU 2006 and heap manipulating SV-COMP benchmarks. ABHA, which is field-, flow-, and context-sensitive, scales to 20 kLoC and can improve the precision even up to 99\% (in terms of the number of aliases). Additionally, ABHA allows any user-defined summarization of an access path to be plugged in; we have implemented and evaluated four summarization techniques. ABHA can also act as a front-end to TVLA, a parametrized shape analyzer, in order to automate its parametrization by generating predicates that capture the program behaviour more accurately.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Fang:2017:RHF, author = "Bin Fang and Mihaela Sighireanu", title = "A refinement hierarchy for free list memory allocators", journal = j-SIGPLAN, volume = "52", number = "9", pages = "104--114", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092275", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Existing implementations of dynamic memory allocators (DMA) employ a large spectrum of policies and techniques. The formal specifications of these techniques are quite complicated in isolation and very complex when combined. Therefore, the formal reasoning on a specific DMA implementation is difficult for automatic tools and mostly single-use. This paper proposes a solution to this problem by providing formal models for a full class of DMA, the free list class. To obtain manageable formal reasoning and reusable formal models, we organize these models in a hierarchy ranked by refinement relations. We prove the soundness of models and refinement relations using an off-the-shelf theorem prover. We demonstrate that our hierarchy is a basis for an algorithm theory for the class of free list DMA: it abstracts various existing implementations of DMA and leads to new DMA implementations. We illustrate its application to model-based code generation, testing, run-time verification, and static analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Zhang:2017:ACE, author = "Minjia Zhang and Swarnendu Biswas and Michael D. 
Bond", title = "Avoiding consistency exceptions under strong memory models", journal = j-SIGPLAN, volume = "52", number = "9", pages = "115--127", month = sep, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156685.3092271", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Shared-memory languages and systems generally provide weak or undefined semantics for executions with data races. Prior work has proposed memory consistency models that ensure well-defined, easy-to-understand semantics based on region serializability (RS), but the resulting system may throw a consistency exception in the presence of a data race. Consistency exceptions can occur unexpectedly even in well-tested programs, hurting availability and thus limiting the practicality of RS-based memory models. To our knowledge, this paper is the first to consider the problem of availability for memory consistency models that throw consistency exceptions. We first extend existing approaches that enforce RSx, a memory model based on serializability of synchronization-free regions (SFRs), to avoid region conflicts and thus consistency exceptions. These new approaches demonstrate both the potential for and limitations of avoiding consistency exceptions under RSx. To improve availability further, we introduce (1) a new memory model called RIx based on isolation of SFRs and (2) a new approach called Avalon that provides RIx. We demonstrate two variants of Avalon that offer different performance--availability tradeoffs for RIx. An evaluation on real Java programs shows that this work's novel approaches are able to reduce consistency exceptions, thereby improving the applicability of strong memory consistency models. Furthermore, the approaches provide compelling points in the performance--availability tradeoff space for memory consistency enforcement. RIx and Avalon thus represent a promising direction for tackling the challenge of availability under strong consistency models that throw consistency exceptions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '17 conference proceedings.", } @Article{Remy:2017:OEP, author = "Didier R{\'e}my", title = "{Ornaments}: exploiting parametricity for safer, more automated code refactorization and code reuse (invited talk)", journal = j-SIGPLAN, volume = "52", number = "10", pages = "1--1", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3127333", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "Inductive datatypes and parametric polymorphism are two key features introduced in the ML family of languages, which have already been widely exploited for structuring programs: Haskell and ML programs are often more elegant and more correct by construction. Still, we sometimes need code to be refactored or adapted to be reused in a slightly different context. 
While the type system is considerably helpful in these situations, by automatically locating type-inconsistent program points or incomplete pattern matchings, this process could be made safer and more automated by further exploiting parametricity. We propose a posteriori program abstraction as a principle for such code transformations. We apply this principle to ornamentation which is a way to describe changes in datatype definitions reorganizing, adding, or dropping some pieces of data so that functions operating on the bare definition can be partially and sometimes totally lifted into functions operating on the ornamented structure. We view ornamentation as an a posteriori abstraction of the bare code, called a generic lifting, which can then be instantiated into a concrete lifting, meta-reduced, and simplified. Both the source and target code live in core ML while the lifted code lives in a meta-language above ML equipped with a limited form of dependent types needed to capture some invariants of the generic lifting so that the concrete lifting can be simplified back into an ML program. Importantly, the lifted code can be closely related to the bare code, using logical relations thanks to the generic lifting detour. Different, typical use cases of ornaments will be shown and the approach will be mainly illustrated on examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Mokhov:2017:AGC, author = "Andrey Mokhov", title = "Algebraic graphs with class (functional pearl)", journal = j-SIGPLAN, volume = "52", number = "10", pages = "2--13", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122956", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The paper presents a minimalistic and elegant approach to working with graphs in Haskell. It is built on a rigorous mathematical foundation --- an algebra of graphs --- that allows us to apply equational reasoning for proving the correctness of graph transformation algorithms. Algebraic graphs let us avoid partial functions typically caused by `malformed graphs' that contain an edge referring to a non-existent vertex. This helps to liberate APIs of existing graph libraries from partial functions. The algebra of graphs can represent directed, undirected, reflexive and transitive graphs, as well as hypergraphs, by appropriately choosing the set of underlying axioms. 
The flexibility of the approach is demonstrated by developing a library for constructing and transforming polymorphic graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Blazevic:2017:PPP, author = "Mario Blazevi{\'c} and Jacques L{\'e}gar{\'e}", title = "Packrats parse in packs", journal = j-SIGPLAN, volume = "52", number = "10", pages = "14--25", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122958", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a novel but remarkably simple formulation of formal language grammars in Haskell as functions mapping a record of production parsers to itself. Thus formulated grammars are first-class objects, composable and reusable. We also provide a simple parser implementation for them, based on an improved packrat algorithm. In order to make the grammar manipulation code reusable, we introduce a set of type classes mirroring the existing type classes from Haskell base library, but whose methods have rank-2 types.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Lampropoulos:2017:ORU, author = "Leonidas Lampropoulos and Antal Spector-Zabusky and Kenneth Foner", title = "Ode on a random urn (functional pearl)", journal = j-SIGPLAN, volume = "52", number = "10", pages = "26--37", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122959", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the urn, a simple tree-based data structure that supports sampling from and updating discrete probability distributions in logarithmic time. We avoid the usual complexity of traditional self-balancing binary search trees by not keeping values in a specific order. Instead, we keep the tree maximally balanced at all times using a single machine word of overhead: its size. Urns provide an alternative interface for the frequency combinator from the QuickCheck library that allows for asymptotically more efficient sampling from dynamically-updated distributions. 
They also facilitate backtracking in property-based random testing, and can be applied to such complex examples from the literature as generating well-typed lambda terms or information flow machine states, demonstrating significant speedups.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Algehed:2017:QLT, author = "Maximilian Algehed and Koen Claessen and Moa Johansson and Nick Smallbone", title = "{QuickSpec}: a lightweight theory exploration tool for programmers (system demonstration)", journal = j-SIGPLAN, volume = "52", number = "10", pages = "38--39", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122960", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This document gives the outline of a system demonstration for the QuickSpec theory exploration tool.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Braquehais:2017:SDC, author = "Rudy Braquehais and Colin Runciman", title = "{Speculate}: discovering conditional equations and inequalities about black-box functions by reasoning from test results", journal = j-SIGPLAN, volume = "52", number = "10", pages = "40--51", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122961", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents Speculate, a tool that automatically conjectures laws involving conditional equations and inequalities about Haskell functions. Speculate enumerates expressions involving a given collection of Haskell functions, testing to separate those expressions into apparent equivalence classes. Expressions in the same equivalence class are used to conjecture equations. Representative expressions of different equivalence classes are used to conjecture conditional equations and inequalities. Speculate uses lightweight equational reasoning based on term rewriting to discard redundant laws and to avoid needless testing. Several applications demonstrate the effectiveness of Speculate.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Wiegley:2017:UCW, author = "John Wiegley and Benjamin Delaware", title = "Using {Coq} to write fast and correct {Haskell}", journal = j-SIGPLAN, volume = "52", number = "10", pages = "52--62", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122962", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Correctness and performance are often at odds in the field of systems engineering, either because correct programs are too costly to write or impractical to execute, or because well-performing code involves so many tricks of the trade that formal analysis is unable to isolate the main properties of the algorithm. 
As a prime example of this tension, Coq is an established proof environment that allows writing correct, dependently-typed code, but it has been criticized for exorbitant development times, forcing the developer to choose between optimal code or tractable proofs. On the other side of the divide, Haskell has proven itself to be a capable, well-typed programming environment, yet easy-to-read, straightforward code must all too often be replaced by highly optimized variants that obscure the author's original intention. This paper builds on the existing Fiat refinement framework to bridge this divide, demonstrating how to derive a correct-by-construction implementation that meets (or exceeds) the performance characteristics of highly optimized Haskell, starting from a high-level Coq specification. To achieve this goal, we extend Fiat with a stateful notion of refinement of abstract data types and add support for extracting stateful code via a free monad equipped with an algebra of heap-manipulating operations. As a case study, we reimplement a subset of the popular bytestring library, with little to no loss of performance, while retaining a high guarantee of program correctness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Vazou:2017:TTP, author = "Niki Vazou and Leonidas Lampropoulos and Jeff Polakow", title = "A tale of two provers: verifying monoidal string matching in liquid {Haskell} and {Coq}", journal = j-SIGPLAN, volume = "52", number = "10", pages = "63--74", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122963", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "We demonstrate for the first time that Liquid Haskell, a refinement type checker for Haskell programs, can be used for arbitrary theorem proving by verifying a parallel, monoidal string matching algorithm implemented in Haskell. We use refinement types to specify correctness properties, Haskell terms to express proofs of these properties, and Liquid Haskell to check the proofs. We evaluate Liquid Haskell as a theorem prover by replicating our 1428 LoC proof in a dependently-typed language (Coq --- 1136 LoC). Finally, we compare both proofs, uncovering the relative advantages and disadvantages of the two provers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Ekblad:2017:MED, author = "Anton Ekblad", title = "A meta-{EDSL} for distributed web applications", journal = j-SIGPLAN, volume = "52", number = "10", pages = "75--85", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122969", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a domain-specific language for constructing and configuring web applications distributed across any number of networked, heterogeneous systems. 
Our language is embedded in Haskell, provides a common framework for integrating components written in third-party EDSLs, and enables type-safe, access-controlled communication between nodes, as well as effortless sharing and movement of functionality between application components. We give an implementation of our language and demonstrate its applicability by using it to implement several important components of distributed web applications, including RDBMS integration, load balancing, and fine-grained sandboxing of untrusted third party code. The rising popularity of cloud computing and heterogeneous computer architectures is putting a strain on conventional programming models, which commonly assume that one application executes on one machine, or at best on one out of several identical machines. With our language, we take the first step towards a programming model better suited for a computationally multicultural future.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Dawson:2017:CNS, author = "Justin Dawson and Mark Grebe and Andy Gill", title = "Composable network stacks and remote monads", journal = j-SIGPLAN, volume = "52", number = "10", pages = "86--97", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122968", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Monads and applicative functors are two ways that Haskell programmers bundle effectful primitives into effectful program fragments. In this paper, we investigate using monads and applicative functors to bundle remote effectful primitives, specifically aiming to amortize the cost of remote communications using bundling. We look at several ways of maximizing the bundling of primitives, drawing from the remote monad design pattern and Haxl system, and provide a taxonomy of mechanism for amortization, with examples. The result of this investigation is that monadic fragments can be efficiently bundled into packets, almost for free, when given a user-supplied packet transportation mechanism, and the primitives obey some simple pre- and post-conditions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Quick:2017:AMH, author = "Donya Quick", title = "Algorithmic music in {Haskell} (invited talk)", journal = j-SIGPLAN, volume = "52", number = "10", pages = "98--98", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3127334", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Functional programming is becoming increasingly popular in artistic areas such as algorithmic music composition. Euterpea and Kulitta are two libraries for working with music in Haskell. Euterpea is a library for representing and manipulating basic musical structures, and is useful both in a pedagogical setting to teach functional programming through the arts and as a tool to create complex pieces of algorithmic music. 
Kulitta is a framework for automated composition that addresses music at a more abstract level than Euterpea, capturing aspects of musical style through geometric models and probabilistic grammars. Both of these libraries leverage Haskell's pure functional nature and strong type system to achieve versatile, yet concise designs that allow the creation of diverse and interesting music. Features from these libraries have also been integral in the design of newer systems for natural language processing and artificial intelligence in the musical domain. This talk will explore challenges presented by creating these kinds of domain-specific embedded languages for working with music, and how taking functional approaches to them yields elegant solutions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Szamozvancev:2017:WTM, author = "Dmitrij Szamozvancev and Michael B. Gale", title = "Well-typed music does not sound wrong (experience report)", journal = j-SIGPLAN, volume = "52", number = "10", pages = "99--104", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122964", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Music description and generation are popular use cases for Haskell, ranging from live coding libraries to automatic harmonisation systems. Some approaches use probabilistic methods, others build on the theory of Western music composition, but there has been little work done on checking the correctness of musical pieces in terms of voice leading, harmony, and structure. Haskell's recent additions to the type-system now enable us to perform such analysis statically. We present our experience of implementing a type-level model of classical music and an accompanying EDSL which enforce the rules of classical music at compile-time, turning composition mistakes into compiler errors. Along the way, we discuss the strengths and limitations of doing this in Haskell and demonstrate that the type system of the language is fully capable of expressing non-trivial and practical logic specific to a particular domain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Perez:2017:BFT, author = "Ivan Perez", title = "Back to the future: time travel in {FRP}", journal = j-SIGPLAN, volume = "52", number = "10", pages = "105--116", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122957", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Functional Reactive Programming (FRP) allows interactive applications to be modelled in a declarative manner using time-varying values. For practical reasons, however, operational constraints are often imposed, such as having a fixed time domain, time always flowing forward, and limiting the exploration of the past. In this paper we show how these constraints can be overcome, giving local control over the time domain, the direction of time and the sampling step. 
We study the behaviour of FRP expressions when time flows backwards, and demonstrate how to synchronize subsystems running asynchronously and at different sampling rates. We have verified the practicality of our approach with two non-trivial games in which time control is central to the gameplay.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Paykin:2017:LM, author = "Jennifer Paykin and Steve Zdancewic", title = "The Linearity Monad", journal = j-SIGPLAN, volume = "52", number = "10", pages = "117--132", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122965", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce a technique for programming with domain-specific linear languages using the monad that arises from the theory of linear/non-linear logic. In this work we interpret the linear/non-linear model as a simple, effectful linear language embedded inside an existing non-linear host language. We implement a modular framework for defining these linear EDSLs in Haskell, allowing both shallow and deep embeddings. To demonstrate the effectiveness of the framework and the linearity monad, we implement languages for file handles, mutable arrays, session types, and quantum computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Karachalias:2017:EFD, author = "Georgios Karachalias and Tom Schrijvers", title = "Elaboration on functional dependencies: functional dependencies are dead, long live functional dependencies!", journal = j-SIGPLAN, volume = "52", number = "10", pages = "133--147", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122966", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Functional dependencies are a popular extension to Haskell's type-class system because they provide fine-grained control over type inference, resolve ambiguities and even enable type-level computations. Unfortunately, several aspects of Haskell's functional dependencies are ill-understood. In particular, the GHC compiler does not properly enforce the functional dependency property, and rejects well-typed programs because it does not know how to elaborate them into its core language, System F$_C$. This paper presents a novel formalization of functional dependencies that addresses these issues: We explicitly capture the functional dependency property in the type system, in the form of explicit type equalities. We also provide a type inference algorithm and an accompanying elaboration strategy which allows all well-typed programs to be elaborated into System F$_C$.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Bottu:2017:QCC, author = "Gert-Jan Bottu and Georgios Karachalias and Tom Schrijvers and Bruno C. d. S. 
Oliveira and Philip Wadler", title = "Quantified class constraints", journal = j-SIGPLAN, volume = "52", number = "10", pages = "148--161", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122967", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Quantified class constraints have been proposed many years ago to raise the expressive power of type classes from Horn clauses to the universal fragment of Hereditary Harrop logic. Yet, while it has been much asked for over the years, the feature was never implemented or studied in depth. Instead, several workarounds have been proposed, all of which are ultimately stopgap measures. This paper revisits the idea of quantified class constraints and elaborates it into a practical language design. We show the merit of quantified class constraints in terms of more expressive modeling and in terms of terminating type class resolution. In addition, we provide a declarative specification of the type system as well as a type inference algorithm that elaborates into System F. Moreover, we discuss termination conditions of our system and also provide a prototype implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Aronsson:2017:HSC, author = "Markus Aronsson and Mary Sheeran", title = "Hardware software co-design in {Haskell}", journal = j-SIGPLAN, volume = "52", number = "10", pages = "162--173", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122970", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a library in Haskell for programming Field Programmable Gate Arrays (FPGAs), including hardware software co-design. Code for software (in C) and hardware (in VHDL) is generated from a single program, along with the code to support communication between hardware and software. We present type-based techniques for the simultaneous implementation of more than one embedded domain specific language (EDSL). We build upon a generic representation of imperative programs that is loosely coupled to instruction and expression types, allowing the individual parts to be developed and improved separately. Code generation is implemented as a series of translations between progressively smaller, typed EDSLs, safeguarding against errors that arise in untyped translations. Initial case studies show promising performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Clifton-Everest:2017:SIA, author = "Robert Clifton-Everest and Trevor L. McDonell and Manuel M. T. 
Chakravarty and Gabriele Keller", title = "Streaming irregular arrays", journal = j-SIGPLAN, volume = "52", number = "10", pages = "174--185", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122971", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Previous work has demonstrated that it is possible to generate efficient and highly parallel code for multicore CPUs and GPUs from combinator-based array languages for a range of applications. That work, however, has been limited to operating on flat, rectangular structures without any facilities for irregularity or nesting. In this paper, we show that even a limited form of nesting provides substantial benefits both in terms of the expressiveness of the language (increasing modularity and providing support for simple irregular structures) and the portability of the code (increasing portability across resource-constrained devices, such as GPUs). Specifically, we generalise Blelloch's flattening transformation along two lines: (1) we explicitly distinguish between definitely regular and potentially irregular computations; and (2) we handle multidimensional arrays. We demonstrate the utility of this generalisation by an extension of the embedded array language Accelerate to include irregular streams of multidimensional arrays. We discuss code generation, optimisation, and irregular stream scheduling as well as a range of benchmarks on both multicore CPUs and GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Yates:2017:ISP, author = "Ryan Yates and Michael L. Scott", title = "Improving {STM} performance with transactional structs", journal = j-SIGPLAN, volume = "52", number = "10", pages = "186--196", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122972", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software transactional memory (STM) has made it significantly easier to write correct concurrent programs in Haskell. Its performance, however, is limited by several inefficiencies. While safe concurrent computations are easy to express in Haskell's STM, concurrent data structures suffer unfortunate bloat in the implementation due to an extra level of indirection for mutable references as well as the inability to express unboxed mutable transactional values. We address these deficiencies by introducing TStruct to the GHC run-time system, allowing strict unboxed transactional values as well as mutable references without an extra indirection. Using TStruct we implement several data structures, discuss their design, and provide benchmark results on a large multicore machine.
Our benchmarks show that concurrent data structures built with TStruct out-scale and out-perform their TVar-based equivalents.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Chen:2017:ALF, author = "Chao-Hong Chen and Vikraman Choudhury and Ryan R. Newton", title = "Adaptive lock-free data structures in {Haskell}: a general method for concurrent implementation swapping", journal = j-SIGPLAN, volume = "52", number = "10", pages = "197--211", month = oct, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3156695.3122973", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A key part of implementing high-level languages is providing built-in and default data structures. Yet selecting good defaults is hard. A mutable data structure's workload is not known in advance, and it may shift over its lifetime --- e.g., between read-heavy and write-heavy, or from heavy contention by multiple threads to single-threaded or low-frequency use. One idea is to switch implementations adaptively, but it is nontrivial to switch the implementation of a concurrent data structure at runtime. Performing the transition requires a concurrent snapshot of data structure contents, which normally demands special engineering in the data structure's design. However, in this paper we identify and formalize a relevant property of lock-free algorithms. Namely, lock-freedom is sufficient to guarantee that freezing memory locations in an arbitrary order will result in a valid snapshot. Several functional languages have data structures that freeze and thaw, transitioning between mutable and immutable, such as Haskell vectors and Clojure transients, but these enable only single-threaded writers. We generalize this approach to augment an arbitrary lock-free data structure with the ability to gradually freeze and optionally transition to a new representation. This augmentation doesn't require changing the algorithm or code for the data structure, only replacing its datatype for mutable references with a freezable variant. In this paper, we present an algorithm for lifting plain to adaptive data and prove that the resulting hybrid data structure is itself lock-free, linearizable, and simulates the original. We also perform an empirical case study in the context of heating up and cooling down concurrent maps.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '17 conference proceedings.", } @Article{Pizlo:2017:JVM, author = "Filip Pizlo", title = "The {JavaScriptCore} virtual machine (invited talk)", journal = j-SIGPLAN, volume = "52", number = "11", pages = "1--1", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3148567", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "JavaScriptCore (JSC) is an open-source high-performance implementation of JavaScript.
JSC is used in the WebKit open source browser engine as well as a system framework on macOS and iOS. This talk will give a broad high-level overview of JSC's performance-oriented architecture, including specific details about the object model, garbage collector, optimizing compilers, type inference, and deoptimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Siek:2017:CPT, author = "Jeremy Siek", title = "Challenges and progress toward efficient gradual typing (invited talk)", journal = j-SIGPLAN, volume = "52", number = "11", pages = "2--2", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3148570", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mixing static and dynamic type checking in the same language is catching on, with the TypeScript and Flow variants of JavaScript, the MyPy and Reticulated variants of Python, the Strongtalk and Gradualtalk variants of Smalltalk, as well as Typed Racket, Typed Clojure, and Perl 6. The gradual typing approach to such mixing seeks to protect the statically typed code from the dynamically typed code, allowing compilers to leverage type information when optimizing the static code. Unfortunately, ensuring soundness requires runtime checking at the boundaries of typed and untyped code, and the cost of this checking can drown out the performance benefits of optimization. For example, in Typed Racket, some partially typed programs are 1000X slower than the untyped or fully typed version of the same program. But all is not lost! In this talk I present the results of ongoing research to tame the runtime overheads of gradual typing in the context of a prototype compiler, named Grift, that we are developing at Indiana University.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Marr:2017:CAP, author = "Stefan Marr and Carmen Torres Lopez and Dominik Aumayr and Elisa Gonzalez Boix and Hanspeter M{\"o}ssenb{\"o}ck", title = "A concurrency-agnostic protocol for multi-paradigm concurrent debugging tools", journal = j-SIGPLAN, volume = "52", number = "11", pages = "3--14", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3133842", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today's complex software systems combine high-level concurrency models. Each model is used to solve a specific set of problems. Unfortunately, debuggers support only the low-level notions of threads and shared memory, forcing developers to reason about these notions instead of the high-level concurrency models they chose. This paper proposes a concurrency-agnostic debugger protocol that decouples the debugger from the concurrency models employed by the target application. 
As a result, the underlying language runtime can define custom breakpoints, stepping operations, and execution events for each concurrency model it supports, and a debugger can expose them without having to be specifically adapted. We evaluated the generality of the protocol by applying it to SOMns, a Newspeak implementation, which supports a diversity of concurrency models including communicating sequential processes, communicating event loops, threads and locks, fork/join parallelism, and software transactional memory. We implemented 21 breakpoints and 20 stepping operations for these concurrency models. For none of these, the debugger needed to be changed. Furthermore, we visualize all concurrent interactions independently of a specific concurrency model. To show that tooling for a specific concurrency model is possible, we visualize actor turns and message sends separately.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Ungar:2017:DAO, author = "David Ungar and David Grove and Hubertus Franke", title = "Dynamic atomicity: optimizing swift memory management", journal = j-SIGPLAN, volume = "52", number = "11", pages = "15--26", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3133843", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Swift is a modern multi-paradigm programming language with an extensive developer community and open source ecosystem. Swift 3's memory management strategy is based on Automatic Reference Counting (ARC) augmented with unsafe APIs for manually-managed memory. We have seen ARC consume as much as 80\% of program execution time. A significant portion of ARC's direct performance cost can be attributed to its use of atomic machine instructions to protect reference count updates from data races. Consequently, we have designed and implemented dynamic atomicity, an optimization which safely replaces atomic reference-counting operations with nonatomic ones where feasible. The optimization introduces a store barrier to detect possibly intra-thread references, compiler-generated recursive reference-tracers to find all affected objects, and a bit of state in each reference count to encode its atomicity requirements. Using a suite of 171 microbenchmarks, 9 programs from the Computer Language Benchmarks Game, and the Richards benchmark, we performed a limit study by unsafely making all reference counting operations nonatomic. We measured potential speedups of up to 220\% on the microbenchmarks, 120\% on the Benchmarks Game and 70\% on Richards. By automatically reducing ARC overhead, our optimization both improves Swift 3's performance and reduces the temptation for performance-oriented programmers to resort to unsafe manual memory management. 
Furthermore, the machinery implemented for dynamic atomicity could also be employed to obtain cheaper thread-safe Swift data structures, or to augment ARC with optional cycle detection or a backup tracing garbage collector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Infante:2017:OER, author = "Alejandro Infante and Alexandre Bergel", title = "Object equivalence: revisiting object equality profiling (an experience report)", journal = j-SIGPLAN, volume = "52", number = "11", pages = "27--38", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3133844", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern object-oriented programming languages greatly alleviate memory management for programmers. Despite the efficiency of garbage collection and Just-In-Time program analyses, memory still remains prone to being wasted. A bloated memory may have severe consequences, including frequent execution lags due to a high pressure on the garbage collector and suboptimal object dependencies. We found that dynamically monitoring object production sites and the equivalence of the produced objects is key to identifying wasted memory consumption caused by redundant objects. We implemented optimizations for reducing the memory consumption of six applications, achieving a reduction of over 40\% in half of the applications without having any prior knowledge of these applications. Our results partially replicate the results obtained by Marinov and O'Callahan and explore new ways to identify redundant objects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Pimas:2017:GCE, author = "Javier Pim{\'a}s and Javier Burroni and Jean Baptiste Arnaud and Stefan Marr", title = "Garbage collection and efficiency in dynamic metacircular runtimes: an experience report", journal = j-SIGPLAN, volume = "52", number = "11", pages = "39--50", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3133845", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "In dynamic object-oriented languages, low-level mechanisms such as just-in-time compilation, object allocation, garbage collection (GC) and method dispatch are often handled by virtual machines (VMs). VMs are typically implemented using static languages, allowing only a few changes at run time. In such systems, the VM is not part of the language and interfaces to memory management or method dispatch are fixed, not allowing for arbitrary adaptation. Furthermore, the implementation can typically not be inspected or debugged with standard tools used to work on application code. This paper reports on our experience building Bee, a dynamic Smalltalk runtime, written in Smalltalk.
Bee is a Dynamic Metacircular Runtime (DMR) and seamlessly integrates the VM into the application and thereby overcomes many restrictions of classic VMs, for instance by allowing arbitrary code modifications of the VM at run time. Furthermore, the approach enables developers to use their standard tools for application code also for the VM, allowing them to inspect, debug, understand, and modify a DMR seamlessly. We detail our experience of implementing GC, compilation, and optimizations in a DMR. We discuss examples where we found that DMRs can improve understanding of the system, provide tighter control of the software stack, and facilitate research. We also show that the Bee DMR matches and surpasses the performance of a widely used Smalltalk VM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Loring:2017:SAJ, author = "Matthew C. Loring and Mark Marron and Daan Leijen", title = "Semantics of asynchronous {JavaScript}", journal = j-SIGPLAN, volume = "52", number = "11", pages = "51--62", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3133846", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "JavaScript code running in the Node.js runtime is a major platform for developers building cloud, mobile, or IoT applications. A fundamental concept in Node.js programming is the use of asynchronous callbacks and event loops to provide highly responsive applications. While conceptually simple, this programming model contains numerous subtleties and behaviors that are defined implicitly by the current Node.js implementation. This paper presents the first comprehensive formalization of the Node.js asynchronous execution model and defines a high-level notion of async-contexts to formalize fundamental relationships between asynchronous executions in an application. These formalizations provide a foundation for the construction of static or dynamic program analysis tools, support the exploration of alternative Node.js event loop implementations, and provide a high-level conceptual framework for reasoning about relationships between the execution of asynchronous callbacks in a Node.js application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Vergu:2017:SNR, author = "Vlad Vergu and Michiel Haisma and Eelco Visser", title = "The semantics of name resolution in {Grace}", journal = j-SIGPLAN, volume = "52", number = "11", pages = "63--74", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3133847", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Grace is a dynamic object-oriented programming language designed to aid programming education. We present a formal model of and give an operational semantics for its object model and name resolution algorithm.
Our main contributions are a systematic model of Grace's name resolution using scope graphs, relating linguistic features to other languages, and an operationalization of this model in the form of an operational semantics which is readable and executable. The semantics are extensively tested against a reference Grace implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Soldevila:2017:DLF, author = "Mallku Soldevila and Beta Ziliani and Bruno Silvestre and Daniel Fridlender and Fabio Mascarenhas", title = "Decoding {Lua}: formal semantics for the developer and the semanticist", journal = j-SIGPLAN, volume = "52", number = "11", pages = "75--86", month = nov, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170472.3133848", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:13 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We provide formal semantics for a large subset of the Lua programming language, in its version 5.2. We validate our model by mechanizing it and testing it against the test suite of the reference interpreter of Lua, obtaining evidence that our model accurately represents the language. We target both a PL semanticist --- not necessarily versed in Lua --- , and a Lua developer --- not necessarily versed in semantic frameworks. To the former, we present the peculiarities of the language, and how we model them in a modular small-step operational semantics, using concepts from Felleisen-Hieb's reduction semantics with evaluation contexts. Moreover, we mechanize and test the model in PLT Redex, the de facto tool for reduction semantics. To the reader unfamiliar with such concepts, we provide a gentle introduction to the model. It is our hope that developers of the different Lua implementations and dialects understand the model and consider it both for testing their work and for experimenting with new language features.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "DLS '17 conference proceedings.", } @Article{Dig:2017:LRR, author = "Danny Dig", title = "The landscape of refactoring research in the last decade (keynote)", journal = j-SIGPLAN, volume = "52", number = "12", pages = "1--1", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3148040", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In the last decade refactoring research has seen an exponential growth. I will attempt to map this vast landscape and the advances that the community has made by answering questions such as who does what, when, where, with who, why, and how. I will muse on some of the factors contributing to the growth of the field, the adoption of research into industry, and the lessons that we learned along this journey. 
This will inspire and equip you so that you can make a difference, with people who make a difference, at a time when it makes a difference.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Ge:2017:RSM, author = "Rui Ge and Ronald Garcia", title = "Refining semantics for multi-stage programming", journal = j-SIGPLAN, volume = "52", number = "12", pages = "2--14", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136047", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The multi-stage programming paradigm supports runtime code generation and execution. Though powerful, its potential is impeded by the lack of static analysis support. Van Horn and Might proposed a general-purpose approach to systematically develop static analyses by transforming an environmental abstract machine, which evolves a control string, an environment and a continuation as a program evaluates. To the best of our knowledge, no such semantics exists for a multi-stage language like MetaML. We develop and prove correct an environmental abstract machine semantics for MetaML by gradually refining the reference substitutional structural operational semantics. Highlights of our approach include leveraging explicit substitutions to bridge the gap between substitutional and environmental semantics, and devising meta-environments to model the complexities of variable bindings in multi-stage environmental semantics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Ofenbeck:2017:SGP, author = "Georg Ofenbeck and Tiark Rompf and Markus P{\"u}schel", title = "Staging for generic programming in space and time", journal = j-SIGPLAN, volume = "52", number = "12", pages = "15--28", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136060", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Metaprogramming is among the most promising candidates to solve the abstraction vs performance trade-off that plagues software engineering through specialization. Metaprogramming has been used to enable low-overhead generic programming for a long time, with C++ templates being one of the most prominent examples. But often a single, fixed pattern of specialization is not enough, and more flexibility is needed. Hence, this paper seeks to apply generic programming techniques to challenges in metaprogramming, in particular to abstract over the execution stage of individual program expressions. We thus extend the scope of generic programming into the dimension of time. The resulting notion of stage polymorphism enables novel abstractions in the design of program generators, which we develop and explore in this paper. 
We present one possible implementation, in Scala using the lightweight modular staging (LMS) framework, and apply it to two important case studies: convolution on images and the fast Fourier transform (FFT).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Oishi:2017:SCT, author = "Junpei Oishi and Yukiyoshi Kameyama", title = "Staging with control: type-safe multi-stage programming with control operators", journal = j-SIGPLAN, volume = "52", number = "12", pages = "29--40", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136049", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Staging allows a programmer to write domain-specific, custom code generators. Ideally, a programming language for staging provides all necessary features for staging, and at the same time, gives static guarantee for the safety properties of generated code including well typedness and well scopedness. We address this classic problem for the language with control operators, which allow code optimizations in a modular and compact way. Specifically, we design a staged programming language with the expressive control operators shift0 and reset0, which let us express, for instance, multi-layer let-insertion, while keeping the static guarantee of well typedness and well scopedness. For this purpose, we extend our earlier work on refined environment classifiers which were introduced for the staging language with state. We show that our language is expressive enough to express interesting code generation techniques, and that the type system enjoys type soundness. We also mention a type inference algorithm for our language under reasonable restriction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Courtes:2017:CSG, author = "Ludovic Court{\`e}s", title = "Code staging in {GNU Guix}", journal = j-SIGPLAN, volume = "52", number = "12", pages = "41--48", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136045", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/linux.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", abstract = "GNU Guix is a ``functional'' package manager that borrows from earlier work on Nix by Dolstra et al.. Guix implements high-level abstractions such as packages and operating system services as domain-specific languages (DSL) embedded in Scheme, and it also implements build actions and operating system orchestration in Scheme. This leads to a multi-tier programming environment where embedded code snippets are staged for eventual execution. In this paper we present G-expressions or ``gexps''. We explain our journey from traditional Lisp S-expressions to G-expressions, which augment the former with contextual information, and we discuss the implementation of gexps. We report on our experience using gexps in a variety of operating system use cases --- from package build processes to system services. 
Gexps provide a novel way to cover many aspects of OS configuration in a single, multi-tier language while facilitating code reuse and code sharing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Linsbauer:2017:CVC, author = "Lukas Linsbauer and Thorsten Berger and Paul Gr{\"u}nbacher", title = "A classification of variation control systems", journal = j-SIGPLAN, volume = "52", number = "12", pages = "49--62", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136054", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Version control systems are an integral part of today's software and systems development processes. They facilitate the management of revisions (sequential versions) and variants (concurrent versions) of a system under development and enable collaboration between developers. Revisions are commonly maintained either per file or for the whole system. Variants are supported via branching or forking mechanisms that conceptually clone the whole system under development. It is known that such cloning practices come with disadvantages. In fact, while short-lived branches for isolated development of new functionality (a.k.a. feature branches) are well supported, dealing with long-term and fine-grained system variants currently requires employing additional mechanisms, such as preprocessors, build systems or custom configuration tools. Interestingly, the literature describes a number of variation control systems, which provide a richer set of capabilities for handling fine-grained system variants compared to the version control systems widely used today. In this paper we present a classification and comparison of selected variation control systems to get an understanding of their capabilities and the advantages they can offer. We discuss problems of variation control systems, which may explain their comparably low popularity. We also propose research activities we regard as important to change this situation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Lapena:2017:AIN, author = "Ra{\'u}l Lape{\~n}a and Jaime Font and {\'O}scar Pastor and Carlos Cetina", title = "Analyzing the impact of natural language processing over feature location in models", journal = j-SIGPLAN, volume = "52", number = "12", pages = "63--76", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136052", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Feature Location (FL) is a common task in the Software Engineering field, specially in maintenance and evolution of software products. The results of FL depend in a great manner in the style in which Feature Descriptions and software artifacts are written. Therefore, Natural Language Processing (NLP) techniques are used to process them. 
Through this paper, we analyze the influence of the most common NLP techniques over FL in Conceptual Models through Latent Semantic Indexing, and the influence of human participation when embedding domain knowledge in the process. We evaluated the techniques in a real-world industrial case study in the rolling stocks domain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Fenske:2017:HPA, author = "Wolfram Fenske and Sandro Schulze and Gunter Saake", title = "How preprocessor annotations (do not) affect maintainability: a case study on change-proneness", journal = j-SIGPLAN, volume = "52", number = "12", pages = "77--90", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136059", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Preprocessor annotations (e.g., \#ifdef in C) enable the development of similar, but distinct software variants from a common code base. One particularly popular preprocessor is the C preprocessor, cpp. But the cpp is also widely criticized for impeding software maintenance by making code hard to understand and change. Yet, evidence to support this criticism is scarce. In this paper, we investigate the relation between cpp usage and maintenance effort, which we approximate with the frequency and extent of source code changes. To this end, we mined the version control repositories of eight open-source systems written in C. For each system, we measured if and how individual functions use cpp annotations and how they were changed. We found that functions containing cpp annotations are generally changed more frequently and more profoundly than other functions. However, when accounting for function size, the differences disappear or are greatly diminished. In summary, with respect to the frequency and extent of changes, our findings do not support the criticism of the cpp regarding maintainability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Carlson:2017:TQC, author = "Travis Carlson and Eric {Van Wyk}", title = "Type qualifiers as composable language extensions", journal = j-SIGPLAN, volume = "52", number = "12", pages = "91--103", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136055", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper reformulates type qualifiers as language extensions that can be automatically and reliably composed. Type qualifiers annotate type expressions to introduce new subtyping relations and are powerful enough to detect many kinds of errors. Type qualifiers, as illustrated in our ableC extensible language framework for C, can introduce rich forms of concrete syntax, can generate dynamic checks on data when static checks are infeasible or not appropriate, and inject code that affects the program's behavior, for example for conversions of data or logging. 
ableC language extensions to C are implemented as attribute grammar fragments and provide an expressive mechanism for type qualifier implementations to check for additional errors, e.g. dereferences to pointers not qualified by a ``nonnull'' qualifier, and report custom error messages. Our approach distinguishes language extension users from developers and provides modular analyses to developers to ensure that when users select a set of extensions to use, they will automatically compose to form a working compiler.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Rosa:2017:ARC, author = "Andrea Ros{\`a} and Eduardo Rosales and Walter Binder", title = "Accurate reification of complete supertype information for dynamic analysis on the {JVM}", journal = j-SIGPLAN, volume = "52", number = "12", pages = "104--116", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136061", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Reflective supertype information (RSI) is useful for many instrumentation-based dynamic analyses on the Java Virtual Machine (JVM). On the one hand, while such information can be obtained when performing the instrumentation within the same JVM process executing the instrumented program, in-process instrumentation severely limits the code coverage of the analysis. On the other hand, performing the instrumentation in a separate process can achieve full code coverage, but complete RSI is generally not available, often requiring expensive runtime checks in the instrumented program. Providing accurate and complete RSI in the instrumentation process is challenging because of dynamic class loading and classloader namespaces. In this paper, we present a novel technique to accurately reify complete RSI in a separate instrumentation process. We implement our technique in the dynamic analysis framework DiSL and evaluate it on a task profiler, achieving speedups of up to 45\% for an analysis with full code coverage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Pearce:2017:RSC, author = "David J. Pearce", title = "Rewriting for sound and complete union, intersection and negation types", journal = j-SIGPLAN, volume = "52", number = "12", pages = "117--130", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136042", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Implementing the type system of a programming language is a critical task that is often done in an ad-hoc fashion. Whilst this makes it hard to ensure the system is sound, it also makes it difficult to extend as the language evolves. We are interested in describing type systems using declarative rewrite rules from which an implementation can be automatically generated. 
Whilst not all type systems are easily expressed in this manner, those involving unions, intersections and negations are well-suited for this. In this paper, we consider a relatively complex type system involving unions, intersections and negations developed previously. This system was not developed with rewriting in mind, though clear parallels are immediately apparent from the original presentation. For example, the system presented required types be first converted into a variation on Disjunctive Normal Form. We identify that the original system can, for the most part, be reworked to enable a natural expression using declarative rewrite rules. We present an implementation of our rewrite rules in the Whiley Rewrite Language (WyRL), and report performance results compared with a hand-coded solution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Parreaux:2017:QSR, author = "Lionel Parreaux and Amir Shaikhha and Christoph E. Koch", title = "Quoted staged rewriting: a practical approach to library-defined optimizations", journal = j-SIGPLAN, volume = "52", number = "12", pages = "131--145", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136043", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", abstract = "Staging has proved a successful technique for programmatically removing code abstractions, thereby allowing for faster program execution while retaining a high-level interface for the programmer. Unfortunately, techniques based on staging suffer from a number of problems --- ranging from practicalities to fundamental limitations --- which have prevented their widespread adoption. We introduce Quoted Staged Rewriting (QSR), an approach that uses type-safe, pattern matching-enabled quasiquotes to define optimizations. The approach is ``staged'' in two ways: first, rewrite rules can execute arbitrary code during pattern matching and code reconstruction, leveraging the power and flexibility of staging; second, library designers can orchestrate the application of successive rewriting phases (stages). The advantages of using quasiquote-based rewriting are that library designers never have to deal directly with the intermediate representation (IR), and that it allows for non-intrusive optimizations --- in contrast with staging, it is not necessary to adapt the entire library and user programs to accommodate optimizations. We show how Squid, a Scala macro-based framework, enables QSR and renders library-defined optimizations more practical than ever before: library designers write domain-specific optimizers that users invoke transparently on delimited portions of their code base. 
As a motivating example we describe an implementation of stream fusion (a well-known deforestation technique) that is both simpler and more powerful than the state of the art, and can readily be used by Scala programmers with no knowledge of metaprogramming.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Caldwell:2017:RCC, author = "Joseph Caldwell and Shigeru Chiba", title = "Reducing calling convention overhead in object-oriented programming on embedded {ARM Thumb-2} platforms", journal = j-SIGPLAN, volume = "52", number = "12", pages = "146--156", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136057", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper examines the causes and extent of code size overhead caused by the ARM calling convention in Thumb-2 binaries. We show that binaries generated from C++ source files generally have higher amounts of calling convention overhead, and present a binary file optimizer to eliminate some of that overhead. Calling convention overhead can negatively impact power consumption, flash memory costs, and chip size in embedded or otherwise resource-constrained domains. This is particularly true on platforms using ``compressed'' instruction sets, such as the 16-bit ARM Thumb and Thumb-2 instruction sets, used in virtually all smartphones and in many other smaller-scale embedded devices. In this paper, we examine the extent of calling convention overhead in practical software, and compare the results of C and C++ programs, and find that C++ programs generally have a higher percentage of calling-convention overhead. Finally, we demonstrate a tool capable of eliminating some of this overhead, particularly in the case of C++ programs, by modifying the calling conventions on a per-procedure basis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Perard-Gayot:2017:RSE, author = "Ars{\`e}ne P{\'e}rard-Gayot and Martin Weier and Richard Membarth and Philipp Slusallek and Roland Lei{\ss}a and Sebastian Hack", title = "{RaTrace}: simple and efficient abstractions for {BVH} ray traversal algorithms", journal = j-SIGPLAN, volume = "52", number = "12", pages = "157--168", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136044", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In order to achieve the highest possible performance, the ray traversal and intersection routines at the core of every high-performance ray tracer are usually hand-coded, heavily optimized, and implemented separately for each hardware platform-even though they share most of their algorithmic core. The results are implementations that heavily mix algorithmic aspects with hardware and implementation details, making the code non-portable and difficult to change and maintain. 
In this paper, we present a new approach that offers the ability to define in a functional language a set of conceptual, high-level language abstractions that are optimized away by a special compiler in order to maximize performance. Using this abstraction mechanism we separate a generic ray traversal and intersection algorithm from its low-level aspects that are specific to the target hardware. We demonstrate that our code is not only significantly more flexible, simpler to write, and more concise but also that the compiled results perform as well as state-of-the-art implementations on any of the tested CPU and GPU platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Susungi:2017:TCG, author = "Adilla Susungi and Norman A. Rink and Jer{\'o}nimo Castrill{\'o}n and Immo Huismann and Albert Cohen and Claude Tadonki and J{\"o}rg Stiller and Jochen Fr{\"o}hlich", title = "Towards compositional and generative tensor optimizations", journal = j-SIGPLAN, volume = "52", number = "12", pages = "169--175", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136050", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many numerical algorithms are naturally expressed as operations on tensors (i.e. multi-dimensional arrays). Hence, tensor expressions occur in a wide range of application domains, e.g. quantum chemistry and physics; big data analysis and machine learning; and computational fluid dynamics. Each domain, typically, has developed its own strategies for efficiently generating optimized code, supported by tools such as domain-specific languages, compilers, and libraries. However, strategies and tools are rarely portable between domains, and generic solutions typically act as ''black boxes'' that offer little control over code generation and optimization. As a consequence, there are application domains without adequate support for easily generating optimized code, e.g. computational fluid dynamics. In this paper we propose a generic and easily extensible intermediate language for expressing tensor computations and code transformations in a modular and generative fashion. Beyond being an intermediate language, our solution also offers meta-programming capabilities for experts in code optimization. While applications from the domain of computational fluid dynamics serve to illustrate our proposed solution, we believe that our general approach can help unify research in tensor optimizations and make solutions more portable between domains.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Al-Kofahi:2017:FLL, author = "Jafar M. 
Al-Kofahi and Suresh Kothari and Christian K{\"a}stner", title = "Four languages and lots of macros: analyzing autotools build systems", journal = j-SIGPLAN, volume = "52", number = "12", pages = "176--186", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136051", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Build systems are crucial for software system development, however there is a lack of tool support to help with their high maintenance overhead. GNU Autotools are widely used in the open source community, but users face various challenges from its hard to comprehend nature and staging of multiple code generation steps, often leading to low quality and error-prone build code. In this paper, we present a platform, AutoHaven, to provide a foundation for developers to create analysis tools to help them understand, maintain, and migrate their GNU Autotools build systems. Internally it uses approximate parsing and symbolic analysis of the build logic. We illustrate the use of the platform with two tools: ACSense helps developers to better understand their build systems and ACSniff detects build smells to improve build code quality. Our evaluation shows that AutoHaven can support most GNU Autotools build systems and can detect build smells in the wild.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Fernandes:2017:AUM, author = "Leonardo Fernandes and M{\'a}rcio Ribeiro and Luiz Carvalho and Rohit Gheyi and Melina Mongiovi and Andr{\'e} Santos and Ana Cavalcanti and Fabiano Ferrari and Jos{\'e} Carlos Maldonado", title = "Avoiding useless mutants", journal = j-SIGPLAN, volume = "52", number = "12", pages = "187--198", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136053", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mutation testing is a program-transformation technique that injects artificial bugs to check whether the existing test suite can detect them. However, the costs of using mutation testing are usually high, hindering its use in industry. Useless mutants (equivalent and duplicated) contribute to increase costs. Previous research has focused mainly on detecting useless mutants only after they are generated and compiled. In this paper, we introduce a strategy to help developers with deriving rules to avoid the generation of useless mutants. To use our strategy, we pass as input a set of programs. For each program, we also need a passing test suite and a set of mutants. As output, our strategy yields a set of useless mutants candidates. After manually confirming that the mutants classified by our strategy as ``useless'' are indeed useless, we derive rules that can avoid their generation and thus decrease costs. To the best of our knowledge, we introduce 37 new rules that can avoid useless mutants right before their generation. We then implement a subset of these rules in the MUJAVA mutation testing tool. 
Since our rules have been derived based on artificial and small Java programs, we take our MUJAVA version embedded with our rules and execute it in industrial-scale projects. Our rules reduced the number of mutants by almost 13\% on average. Our results are promising because (i) we avoid useless mutants generation; (ii) our strategy can help with identifying more rules in case we set it to use more complex Java programs; and (iii) our MUJAVA version has only a subset of the rules we derived.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Nakamaru:2017:SFA, author = "Tomoki Nakamaru and Kazuhiro Ichikawa and Tetsuro Yamazaki and Shigeru Chiba", title = "{Silverchain}: a fluent {API} generator", journal = j-SIGPLAN, volume = "52", number = "12", pages = "199--211", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136041", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a tool named Silverchain, which generates class definitions for a fluent API from the grammar of the API. A fluent API is an API that is used by method chaining and its grammar is a BNF-like set of rules that defines method chains accepted in type checking. Fluent APIs generated by Silverchain provide two styles of APIs: One is for building a chain by concatenating all method calls in series. The other is for building a chain from partial chains by passing child chains to method calls in the parent chain as their arguments. To generate such a fluent API, Silverchain first translates the given grammar into a set of deterministic pushdown automata without $\epsilon$-transitions, then encodes these automata into class definitions. Each constructed automaton corresponds to a nonterminal in the given grammar and recognizes symbol sequences produced from its corresponding nonterminal.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Zaytsev:2017:PGE, author = "Vadim Zaytsev", title = "Parser generation by example for legacy pattern languages", journal = j-SIGPLAN, volume = "52", number = "12", pages = "212--218", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136058", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Most modern software languages enjoy relatively free and relaxed concrete syntax, with significant flexibility of formatting of the program/model/sheet text. Yet, in the dark legacy corners of software engineering there are still languages with a strict fixed column-based structure --- the compromises of times long gone, attempting to combine some human readability with some ease of machine processing. In this paper, we consider an industrial case study for retirement of a legacy domain-specific language, completed under extreme circumstances: absolute lack of documentation, varying line structure, hierarchical blocks within one file, scalability demands for millions of lines of code, performance demands for manipulating tens of thousands of multi-megabyte files, etc.
However, the regularity of the language allowed to infer its structure from the available examples, automatically, and produce highly efficient parsers for it.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Mainland:2017:HCS, author = "Geoffrey Mainland and Jeremy Johnson", title = "A {Haskell} compiler for signal transforms", journal = j-SIGPLAN, volume = "52", number = "12", pages = "219--232", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136056", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Building a reusable, auto-tuning code generator from scratch is a challenging problem, requiring many careful design choices. We describe HSpiral, a Haskell compiler for signal transforms that builds on the foundational work of Spiral. Our design leverages many Haskell language features to ensure that our framework is reusable, flexible, and efficient. As well as describing the design of our system, we show how to extend it to support new classes of transforms, including the number-theoretic transform and a variant of the split-radix algorithm that results in reduced operation counts. We also show how to incorporate rewrite rules into our system to reproduce results from previous literature on code generation for the fast Fourier transform. Although the Spiral project demonstrated significant advances in automatic code generation, it has not been widely used by other researchers. HSpiral is freely available under an MIT-style license, and we are actively working to turn it into a tool to further both our own research goals and to serve as a foundation for other research groups' work in developing new implementations of signal transform algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Martini:2017:AGV, author = "Ricardo Giuliani Martini and Pedro Rangel Henriques", title = "Automatic generation of virtual learning spaces driven by {CaVa DSL}: an experience report", journal = j-SIGPLAN, volume = "52", number = "12", pages = "233--245", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136046", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Several applications are based on Domain-Specific Languages (DSL). They provide the right terminology to a peculiar problem/subject, because they use a particular domain vocabulary that defines abstract concepts, different from general-purpose languages. Aiming an easy generation of virtual Learning Spaces (LS) for the use of the responsible of institutional archives or museums, we have idealized and developed an external domain-specific language, called CaVa DSL, to describe, in an abstract level, virtual exhibition rooms in the museum curator's viewpoint, giving the curator the possibility to specify the virtual LS upon a domain ontology vocabulary. 
We also contribute with a set of processors that deal with CaVa DSL and generates virtual Learning Spaces, turning available the navigation over important and real information contained in archival documents to the public through virtual museums. To demonstrate the obtained results, we present a running example along the paper showing the virtual LS generation process.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Grebe:2017:RSD, author = "Mark Grebe and David Young and Andy Gill", title = "Rewriting a shallow {DSL} using a {GHC} compiler extension", journal = j-SIGPLAN, volume = "52", number = "12", pages = "246--258", month = dec, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3170492.3136048", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:14 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Embedded Domain Specific Languages are a powerful tool for developing customized languages to fit specific problem domains. Shallow EDSLs allow a programmer to program using many of the features of a host language and its syntax, but sacrifice performance. Deep EDSLs provide better performance and flexibility, through the ability to manipulate the abstract syntax tree of the DSL program, but sacrifice syntactical similarity to the host language. Using Haskino, an EDSL designed for small embedded systems based on the Arduino line of microcontrollers, and a compiler plugin for the Haskell GHC compiler, we show a method for combining the best aspects of shallow and deep EDSLs. The programmer is able to write in the shallow EDSL, and have it automatically transformed into the deep EDSL. This allows the EDSL user to benefit from powerful aspects of the host language, Haskell, while meeting the demanding resource constraints of the small embedded processing environment.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "GPCE '17 conference proceedings.", } @Article{Wen:2018:IBM, author = "Haosen Wen and Joseph Izraelevitz and Wentao Cai and H. Alan Beadle and Michael L. Scott", title = "Interval-based memory reclamation", journal = j-SIGPLAN, volume = "53", number = "1", pages = "1--13", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178488", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper we present interval-based reclamation (IBR), a new approach to safe reclamation of disconnected memory blocks in nonblocking concurrent data structures. Safe reclamation is a difficult problem: a thread, before freeing a block, must ensure that no other threads are accessing that block; the required synchronization tends to be expensive. In contrast with epoch-based reclamation, in which threads reserve all blocks created after a certain time, or pointer-based reclamation (e.g., hazard pointers), in which threads reserve individual blocks, IBR allows a thread to reserve all blocks known to have existed in a bounded interval of time. 
By comparing a thread's reserved interval with the lifetime of a detached but not yet reclaimed block, the system can determine if the block is safe to free. Like hazard pointers, IBR avoids the possibility that a single stalled thread may reserve an unbounded number of blocks; unlike hazard pointers, it avoids a memory fence on most pointer-following operations. It also avoids the need to explicitly ``unreserve'' a no-longer-needed pointer. We describe three specific IBR schemes (one with several variants) that trade off performance, applicability, and space requirements. IBR requires no special hardware or OS support. In experiments with data structure microbenchmarks, it also compares favorably (in both time and space) to other state-of-the-art approaches, making it an attractive alternative for libraries of concurrent data structures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Arbel-Raviv:2018:HEB, author = "Maya Arbel-Raviv and Trevor Brown", title = "Harnessing epoch-based reclamation for efficient range queries", journal = j-SIGPLAN, volume = "53", number = "1", pages = "14--27", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178489", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent sets with range query operations are highly desirable in applications such as in-memory databases. However, few set implementations offer range queries. Known techniques for augmenting data structures with range queries (or operations that can be used to build range queries) have numerous problems that limit their usefulness. For example, they impose high overhead or rely heavily on garbage collection. In this work, we show how to augment data structures with highly efficient range queries, without relying on garbage collection. We identify a property of epoch-based memory reclamation algorithms that makes them ideal for implementing range queries, and produce three algorithms, which use locks, transactional memory and lock-free techniques, respectively. Our algorithms are applicable to more data structures than previous work, and are shown to be highly efficient on a large scale Intel system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Friedman:2018:PLF, author = "Michal Friedman and Maurice Herlihy and Virendra Marathe and Erez Petrank", title = "A persistent lock-free queue for non-volatile memory", journal = j-SIGPLAN, volume = "53", number = "1", pages = "28--40", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178490", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Non-volatile memory is expected to coexist with (or even displace) volatile DRAM for main memory in upcoming architectures. This has led to increasing interest in the problem of designing and specifying durable data structures that can recover from system crashes. 
Data structures may be designed to satisfy stricter or weaker durability guarantees to provide a balance between the strength of the provided guarantees and performance overhead. This paper proposes three novel implementations of a concurrent lock-free queue. These implementations illustrate algorithmic challenges in building persistent lock-free data structures with different levels of durability guarantees. In presenting these challenges, the proposed algorithmic designs, and the different durability guarantees, we hope to shed light on ways to build a wide variety of durable data structures. We implemented the various designs and compared their performance overhead to a simple queue design for standard (volatile) memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Wang:2018:SDG, author = "Linnan Wang and Jinmian Ye and Yiyang Zhao and Wei Wu and Ang Li and Shuaiwen Leon Song and Zenglin Xu and Tim Kraska", title = "Superneurons: dynamic {GPU} memory management for training deep neural networks", journal = j-SIGPLAN, volume = "53", number = "1", pages = "41--53", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178491", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Going deeper and wider in neural architectures improves their accuracy, while the limited GPU DRAM places an undesired restriction on the network design domain. Deep Learning (DL) practitioners either need to change to less desired network architectures, or nontrivially dissect a network across multiGPUs. These distract DL practitioners from concentrating on their original machine learning tasks. We present SuperNeurons: a dynamic GPU memory scheduling runtime to enable the network training far beyond the GPU DRAM capacity. SuperNeurons features 3 memory optimizations, Liveness Analysis, Unified Tensor Pool, and Cost-Aware Recomputation; together they effectively reduce the network-wide peak memory usage down to the maximal memory usage among layers. We also address the performance issues in these memory-saving techniques. Given the limited GPU DRAM, SuperNeurons not only provisions the necessary memory for the training, but also dynamically allocates the memory for convolution workspaces to achieve the high performance. Evaluations against Caffe, Torch, MXNet and TensorFlow have demonstrated that SuperNeurons trains at least 3.2432 deeper network than current ones with the leading performance. Particularly, SuperNeurons can train ResNet2500 that has 10$^4$ basic network layers on a 12GB K40c.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Belviranli:2018:JDA, author = "Mehmet E. Belviranli and Seyong Lee and Jeffrey S. Vetter and Laxmi N. 
Bhuyan", title = "{Juggler}: a dependence-aware task-based execution framework for {GPUs}", journal = j-SIGPLAN, volume = "53", number = "1", pages = "54--67", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178492", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scientific applications with single instruction, multiple data (SIMD) computations show considerable performance improvements when run on today's graphics processing units (GPUs). However, the existence of data dependences across thread blocks may significantly impact the speedup by requiring global synchronization across multiprocessors (SMs) inside the GPU. To efficiently run applications with interblock data dependences, we need fine-granular task-based execution models that will treat SMs inside a GPU as stand-alone parallel processing units. Such a scheme will enable faster execution by utilizing all internal computation elements inside the GPU and eliminating unnecessary waits during device-wide global barriers. In this paper, we propose Juggler, a task-based execution scheme for GPU workloads with data dependences. The Juggler framework takes applications embedding OpenMP 4.5 tasks as input and executes them on the GPU via an efficient in-device runtime, hence eliminating the need for kernel-wide global synchronization. Juggler requires no or little modification to the source code, and once launched, the runtime entirely runs on the GPU without relying on the host through the entire execution. We have evaluated Juggler on an NVIDIA Tesla P100 GPU and obtained up to 31\% performance improvement against global barrier based implementation, with minimal runtime overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Kotsifakou:2018:HHP, author = "Maria Kotsifakou and Prakalp Srivastava and Matthew D. Sinclair and Rakesh Komuravelli and Vikram Adve and Sarita Adve", title = "{HPVM}: heterogeneous parallel virtual machine", journal = j-SIGPLAN, volume = "53", number = "1", pages = "68--80", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178493", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "We propose a parallel program representation for heterogeneous systems, designed to enable performance portability across a wide range of popular parallel hardware, including GPUs, vector instruction sets, multicore CPUs and potentially FPGAs. Our representation, which we call HPVM, is a hierarchical dataflow graph with shared memory and vector instructions. HPVM supports three important capabilities for programming heterogeneous systems: a compiler intermediate representation (IR), a virtual instruction set (ISA), and a basis for runtime scheduling; previous systems focus on only one of these capabilities. As a compiler IR, HPVM aims to enable effective code generation and optimization for heterogeneous systems. 
As a virtual ISA, it can be used to ship executable programs, in order to achieve both functional portability and performance portability across such systems. At runtime, HPVM enables flexible scheduling policies, both through the graph structure and the ability to compile individual nodes in a program to any of the target devices on a system. We have implemented a prototype HPVM system, defining the HPVM IR as an extension of the LLVM compiler IR, compiler optimizations that operate directly on HPVM graphs, and code generators that translate the virtual ISA to NVIDIA GPUs, Intel's AVX vector units, and to multicore X86-64 processors. Experimental results show that HPVM optimizations achieve significant performance improvements, HPVM translators achieve performance competitive with manually developed OpenCL code for both GPUs and vector hardware, and that runtime scheduling policies can make use of both program and runtime information to exploit the flexible compilation capabilities. Overall, we conclude that the HPVM representation is a promising basis for achieving performance portability and for implementing parallelizing compilers for heterogeneous parallel systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Guatto:2018:HMM, author = "Adrien Guatto and Sam Westrick and Ram Raghunathan and Umut Acar and Matthew Fluet", title = "Hierarchical memory management for mutable state", journal = j-SIGPLAN, volume = "53", number = "1", pages = "81--93", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178494", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is well known that modern functional programming languages are naturally amenable to parallel programming. Achieving efficient parallelism using functional languages, however, remains difficult. Perhaps the most important reason for this is their lack of support for efficient in-place updates, i.e., mutation, which is important for the implementation of both parallel algorithms and the run-time system services (e.g., schedulers and synchronization primitives) used to execute them. In this paper, we propose techniques for efficient mutation in parallel functional languages. To this end, we couple the memory manager with the thread scheduler to make reading and updating data allocated by nested threads efficient. We describe the key algorithms behind our technique, implement them in the MLton Standard ML compiler, and present an empirical evaluation. 
Our experiments show that the approach performs well, significantly improving efficiency over existing functional language implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Zhao:2018:BGB, author = "Yue Zhao and Jiajia Li and Chunhua Liao and Xipeng Shen", title = "Bridging the gap between deep learning and sparse matrix format selection", journal = j-SIGPLAN, volume = "53", number = "1", pages = "94--108", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178495", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This work presents a systematic exploration on the promise and special challenges of deep learning for sparse matrix format selection---a problem of determining the best storage format for a matrix to maximize the performance of Sparse Matrix Vector Multiplication (SpMV). It describes how to effectively bridge the gap between deep learning and the special needs of the pillar HPC problem through a set of techniques on matrix representations, deep learning structure, and cross-architecture model migrations. The new solution cuts format selection errors by two thirds, and improves SpMV performance by 1.73X on average over the state of the art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Jia:2018:ODW, author = "Zhen Jia and Aleksandar Zlateski and Fredo Durand and Kai Li", title = "Optimizing {$N$}-dimensional, {Winograd}-based convolution for manycore {CPUs}", journal = j-SIGPLAN, volume = "53", number = "1", pages = "109--123", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178496", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent work on Winograd-based convolution allows for a great reduction of computational complexity, but existing implementations are limited to 2D data and a single kernel size of 3 by 3. They can achieve only slightly better, and often worse performance than better optimized, direct convolution implementations. We propose and implement an algorithm for N-dimensional Winograd-based convolution that allows arbitrary kernel sizes and is optimized for manycore CPUs. Our algorithm achieves high hardware utilization through a series of optimizations. Our experiments show that on modern ConvNets, our optimized implementation, is on average more than 3 x, and sometimes 8 x faster than other state-of-the-art CPU implementations on an Intel Xeon Phi manycore processors. 
Moreover, our implementation on the Xeon Phi achieves competitive performance for 2D ConvNets and superior performance for 3D ConvNets, compared with the best GPU implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Tang:2018:VLF, author = "Xiongchao Tang and Jidong Zhai and Xuehai Qian and Bingsheng He and Wei Xue and Wenguang Chen", title = "{vSensor}: leveraging fixed-workload snippets of programs for performance variance detection", journal = j-SIGPLAN, volume = "53", number = "1", pages = "124--136", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178497", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Performance variance becomes increasingly challenging on current large-scale HPC systems. Even using a fixed number of computing nodes, the execution time of several runs can vary significantly. Many parallel programs executing on supercomputers suffer from such variance. Performance variance not only causes unpredictable performance requirement violations, but also makes it unintuitive to understand the program behavior. Despite prior efforts, efficient on-line detection of performance variance remains an open problem. In this paper, we propose vSensor, a novel approach for light-weight and on-line performance variance detection. The key insight is that, instead of solely relying on an external detector, the source code of a program itself could reveal the runtime performance characteristics. Specifically, many parallel programs contain code snippets that are executed repeatedly with an invariant quantity of work. Based on this observation, we use compiler techniques to automatically identify these fixed-workload snippets and use them as performance variance sensors (v-sensors) that enable effective detection. We evaluate vSensor with a variety of parallel programs on the Tianhe-2 system. Results show that vSensor can effectively detect performance variance on HPC systems. The performance overhead is smaller than 4\% with up to 16,384 processes. In particular, with vSensor, we found a bad node with slow memory that slowed a program's performance by 21\%. As a showcase, we also detected a severe network performance problem that caused a 3.37X slowdown for an HPC kernel program on the Tianhe-2 system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Prokopec:2018:CTC, author = "Aleksandar Prokopec", title = "Cache-tries: concurrent lock-free hash tries with constant-time operations", journal = j-SIGPLAN, volume = "53", number = "1", pages = "137--151", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178498", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent non-blocking hash tries have good cache locality, and horizontally scalable operations. However, operations on most existing concurrent hash tries run in O(log n) time.
In this paper, we show that the concurrent hash trie operations can run in expected constant time. We present a novel lock-free concurrent hash trie design that exerts less pressure on the memory allocator. This hash trie is augmented with a quiescently consistent cache, which permits the basic operations to run in expected O (1) time. We show a statistical analysis for the constant-time bound, which, to the best of our knowledge, is the first such proof for hash tries. We also prove the safety, lock-freedom and linearizability properties. On typical workloads, our implementation demonstrates up to 5X performance improvements with respect to the previous hash trie variants.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Chabbi:2018:FFF, author = "Milind Chabbi and Shasha Wen and Xu Liu", title = "Featherlight on-the-fly false-sharing detection", journal = j-SIGPLAN, volume = "53", number = "1", pages = "152--167", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178499", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Shared-memory parallel programs routinely suffer from false sharing---a performance degradation caused by different threads accessing different variables that reside on the same CPU cacheline and at least one variable is modified. State-of-the-art tools detect false sharing via a heavyweight process of logging memory accesses and feeding the ensuing access traces to an offline cache simulator. We have developed Feather, a lightweight, on-the-fly false-sharing detection tool. Feather achieves low overhead by exploiting two hardware features ubiquitous in commodity CPUs: the performance monitoring units (PMU) and debug registers. Additionally, Feather is a first-of-its-kind tool to detect false sharing in multi-process applications that use shared memory. Feather allowed us to scale false-sharing detection to myriad codes. Feather detected several false-sharing cases in important multi-core and multi-process codes including previous PPoPP artifacts. Eliminating false sharing resulted in dramatic (up to 16x) speedups.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Rawat:2018:ROS, author = "Prashant Singh Rawat and Fabrice Rastello and Aravind Sukumaran-Rajam and Louis-No{\"e}l Pouchet and Atanas Rountev and P. Sadayappan", title = "Register optimizations for stencils on {GPUs}", journal = j-SIGPLAN, volume = "53", number = "1", pages = "168--182", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178500", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The recent advent of compute-intensive GPU architecture has allowed application developers to explore high-order 3D stencils for better computational accuracy. A common optimization strategy for such stencils is to expose sufficient data reuse by means such as loop unrolling, with the expectation of register-level reuse. However, the resulting code is often highly constrained by register pressure. 
While current state-of-the-art register allocators are satisfactory for most applications, they are unable to effectively manage register pressure for such complex high-order stencils, resulting in sub-optimal code with a large number of register spills. In this paper, we develop a statement reordering framework that models stencil computations as a DAG of trees with shared leaves, and adapts an optimal scheduling algorithm for minimizing register usage for expression trees. The effectiveness of the approach is demonstrated through experimental results on a range of stencils extracted from application codes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Zheng:2018:FPS, author = "Da Zheng and Disa Mhembere and Joshua T. Vogelstein and Carey E. Priebe and Randal Burns", title = "{FlashR}: parallelize and scale {R} for machine learning using {SSDs}", journal = j-SIGPLAN, volume = "53", number = "1", pages = "183--194", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178501", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "R is one of the most popular programming languages for statistics and machine learning, but it is slow and unable to scale to large datasets. The general approach for having an efficient algorithm in R is to implement it in C or FORTRAN and provide an R wrapper. FlashR accelerates and scales existing R code by parallelizing a large number of matrix functions in the R base package and scaling them beyond memory capacity with solid-state drives (SSDs). FlashR performs memory hierarchy aware execution to speed up parallelized R code by (i) evaluating matrix operations lazily, (ii) performing all operations in a DAG in a single execution and with only one pass over data to increase the ratio of computation to I/O, (iii) performing two levels of matrix partitioning and reordering computation on matrix partitions to reduce data movement in the memory hierarchy. We evaluate FlashR on various machine learning and statistics algorithms on inputs of up to four billion data points. Despite the huge performance gap between SSDs and RAM, FlashR on SSDs closely tracks the performance of FlashR in memory for many algorithms. The R implementations in FlashR outperform H$_2$O and Spark MLlib by a factor of 3--20.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Menon:2018:DDC, author = "Harshitha Menon and Kathryn Mohror", title = "{DisCVar}: discovering critical variables using algorithmic differentiation for transient faults", journal = j-SIGPLAN, volume = "53", number = "1", pages = "195--206", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178502", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Aggressive technology scaling trends have made the hardware of high performance computing (HPC) systems more susceptible to faults.
Some of these faults can lead to silent data corruption (SDC), and represent a serious problem because they alter the HPC simulation results. In this paper, we present a full-coverage, systematic methodology called DisCVar to identify critical variables in HPC applications for protection against SDC. DisCVar uses automatic differentiation (AD) to determine the sensitivity of the simulation output to errors in program variables. We empirically validate our approach in identifying vulnerable variables by comparing the results against a full-coverage code-level fault injection campaign. We find that DisCVar correctly identifies the variables that are critical to ensuring application SDC resilience, with a high degree of accuracy compared to the results of the fault injection campaign. Additionally, DisCVar requires only two executions of the target program to generate results, whereas in our experiments we needed to perform millions of executions to get the same information from a fault injection campaign.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Drachsler-Cohen:2018:PCT, author = "Dana Drachsler-Cohen and Martin Vechev and Eran Yahav", title = "Practical concurrent traversals in search trees", journal = j-SIGPLAN, volume = "53", number = "1", pages = "207--218", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178503", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Operations of concurrent objects often employ optimistic concurrency-control schemes that consist of a traversal followed by a validation step. The validation checks if concurrent mutations interfered with the traversal to determine if the operation should proceed or restart. A fundamental challenge is to discover a necessary and sufficient validation check that has to be performed to guarantee correctness. In this paper, we show a necessary and sufficient condition for validating traversals in search trees. The condition relies on a new concept of succinct path snapshots, which are derived from and embedded in the structure of the tree. We leverage the condition to design a general lock-free membership test suitable for any search tree. We then show how to integrate the validation condition in update operations of (non-rebalancing) binary search trees, internal and external, and AVL trees.
We experimentally show that our new algorithms outperform existing ones.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Gianinazzi:2018:CAP, author = "Lukas Gianinazzi and Pavel Kalvoda and Alessandro {De Palma} and Maciej Besta and Torsten Hoefler", title = "Communication-avoiding parallel minimum cuts and connected components", journal = j-SIGPLAN, volume = "53", number = "1", pages = "219--232", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178504", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present novel scalable parallel algorithms for finding global minimum cuts and connected components, which are important and fundamental problems in graph processing. To take advantage of future massively parallel architectures, our algorithms are communication-avoiding: they reduce the costs of communication across the network and the cache hierarchy. The fundamental technique underlying our work is the randomized sparsification of a graph: removing a fraction of graph edges, deriving a solution for such a sparsified graph, and using the result to obtain a solution for the original input. We design and implement sparsification with O (1) synchronization steps. Our global minimum cut algorithm decreases communication costs and computation compared to the state-of-the-art, while our connected components algorithm incurs few cache misses and synchronization steps. We validate our approach by evaluating MPI implementations of the algorithms on a petascale supercomputer. We also provide an approximate variant of the minimum cut algorithm and show that it approximates the exact solutions well while using a fraction of cores in a fraction of time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Khyzha:2018:SPT, author = "Artem Khyzha and Hagit Attiya and Alexey Gotsman and Noam Rinetzky", title = "Safe privatization in transactional memory", journal = j-SIGPLAN, volume = "53", number = "1", pages = "233--245", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178505", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactional memory (TM) facilitates the development of concurrent applications by letting the programmer designate certain code blocks as atomic. Programmers using a TM often would like to access the same data both inside and outside transactions, e.g., to improve performance or to support legacy code. In this case, programmers would ideally like the TM to guarantee strong atomicity, where transactions can be viewed as executing atomically also with respect to non-transactional accesses. Since guaranteeing strong atomicity for arbitrary programs is prohibitively expensive, researchers have suggested guaranteeing it only for certain data-race free (DRF) programs, particularly those that follow the privatization idiom: from some point on, threads agree that a given object can be accessed non-transactionally. 
Supporting privatization safely in a TM is nontrivial, because this often requires correctly inserting transactional fences, which wait until all active transactions complete. Unfortunately, there is currently no consensus on a single definition of transactional DRF, in particular, because no existing notion of DRF takes into account transactional fences. In this paper we propose such a notion and prove that, if a TM satisfies a certain condition generalizing opacity and a program using it is DRF assuming strong atomicity, then the program indeed has strongly atomic semantics. We show that our DRF notion allows the programmer to use privatization idioms. We also propose a method for proving our generalization of opacity and apply it to the TL2 TM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Grossman:2018:MPB, author = "Samuel Grossman and Heiner Litz and Christos Kozyrakis", title = "Making pull-based graph processing performant", journal = j-SIGPLAN, volume = "53", number = "1", pages = "246--260", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178506", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graph processing engines following either the push-based or pull-based pattern conceptually consist of a two-level nested loop structure. Parallelizing and vectorizing these loops is critical for high overall performance and memory bandwidth utilization. Outer loop parallelization is simple for both engine types but suffers from high load imbalance. This work focuses on inner loop parallelization for pull engines, which when performed naively leads to a significant increase in conflicting memory writes that must be synchronized. Our first contribution is a scheduler-aware interface for parallel loops that allows us to optimize for the common case in which each thread executes several consecutive iterations. This eliminates most write traffic and avoids all synchronization, leading to speedups of up to 50X. Our second contribution is the Vector-Sparse format, which addresses the obstacles to vectorization that stem from the commonly-used Compressed-Sparse data structure. Our new format eliminates unaligned memory accesses and bounds checks within vector operations, two common problems when processing low-degree vertices. Vectorization with Vector-Sparse leads to speedups of up to 2.5X. Our contributions are embodied in Grazelle, a hybrid graph processing framework. 
On a server equipped with four Intel Xeon E7-4850 v3 processors, Grazelle outperforms Ligra, Polymer, GraphMat, and X-Stream by up to 15.2X, 4.6X, 4.7X, and 66.8X, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Jangda:2018:EFT, author = "Abhinav Jangda and Uday Bondhugula", title = "An effective fusion and tile size model for optimizing image processing pipelines", journal = j-SIGPLAN, volume = "53", number = "1", pages = "261--275", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178507", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Effective models for fusion of loop nests continue to remain a challenge in both general-purpose and domain-specific language (DSL) compilers. The difficulty often arises from the combinatorial explosion of grouping choices and their interaction with parallelism and locality. This paper presents a new fusion algorithm for high-performance domain-specific compilers for image processing pipelines. The fusion algorithm is driven by dynamic programming, explores spaces of fusion possibilities not covered by previous approaches, and uses a cost function that captures optimization criteria more concretely and precisely than prior approaches. The fusion model is particularly tailored to the transformation and optimization sequence applied by PolyMage and Halide, two recent DSLs for image processing pipelines. Our model-driven technique, when implemented in PolyMage, provides significant improvements (up to 4.32X) over PolyMage's approach (which uses auto-tuning to aid its model), and over Halide's automatic approach (by up to 2.46X) on two state-of-the-art shared-memory multicore architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Wang:2018:LLD, author = "Lei Wang and Liangji Zhuang and Junhang Chen and Huimin Cui and Fang Lv and Ying Liu and Xiaobing Feng", title = "{Lazygraph}: lazy data coherency for replicas in distributed graph-parallel computation", journal = j-SIGPLAN, volume = "53", number = "1", pages = "276--289", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178508", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Replicas of a vertex play an important role in existing distributed graph processing systems, which allow a single vertex to be processed in parallel by multiple machines and remote neighbors to be accessed locally without any remote access. However, replicas of vertices introduce a data coherency problem. Existing distributed graph systems treat replicas of a vertex v as an atomic and indivisible vertex, and use an eager data coherency approach to guarantee replica atomicity. In the eager data coherency approach, any changes to vertex data must be immediately communicated to all replicas of v, thus leading to frequent global synchronizations and communications.
In this paper, we propose a lazy data coherency approach, called LazyAsync, which treats replicas of a vertex as independent vertices and maintains data coherency by computations rather than by communications, as in the existing eager approach. Our approach automatically selects some data coherency points from the graph algorithm, and requires all replicas to share the same global view only at such points, which means the replicas may maintain different local views between any two adjacent data coherency points. Based on PowerGraph, we develop a distributed graph processing system, LazyGraph, to implement the LazyAsync approach and exploit graph-aware optimizations. On a 48-node EC2-like cluster, LazyGraph outperforms PowerGraph on four widely used graph algorithms across a variety of real-world graphs, with a speedup ranging from 1.25x to 10.69x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Sun:2018:PPA, author = "Yihan Sun and Daniel Ferizovic and Guy E. Blelloch", title = "{PAM}: parallel augmented maps", journal = j-SIGPLAN, volume = "53", number = "1", pages = "290--304", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178509", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Ordered (key-value) maps are an important and widely-used data type for large-scale data processing frameworks. Beyond simple search, insertion and deletion, more advanced operations such as range extraction, filtering, and bulk updates form a critical part of these frameworks. We describe an interface for ordered maps that is augmented to support fast range queries and sums, and introduce a parallel and concurrent library called PAM (Parallel Augmented Maps) that implements the interface. The interface includes a wide variety of functions on augmented maps ranging from basic insertion and deletion to more interesting functions such as union, intersection, filtering, extracting ranges, splitting, and range-sums. We describe algorithms for these functions that are efficient both in theory and practice. As examples of the use of the interface and the performance of PAM, we apply the library to four applications: simple range sums, interval trees, 2D range trees, and ranked word index searching. The interface greatly simplifies the implementation of these data structures over direct implementations.
Sequentially the code achieves performance that matches or exceeds existing libraries designed specially for a single application, and in parallel our implementation gets speedups ranging from 40 to 90 on 72 cores with 2-way hyperthreading.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Fu:2018:ESM, author = "Zhouwang Fu and Tao Song and Zhengwei Qi and Haibing Guan", title = "Efficient shuffle management with {SCache} for {DAG} computing frameworks", journal = j-SIGPLAN, volume = "53", number = "1", pages = "305--316", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In large-scale data-parallel analytics, shuffle, or the cross-network read and aggregation of partitioned data between tasks with data dependencies, usually brings in large overhead. To reduce shuffle overhead, we present SCache, an open source plug-in system that particularly focuses on shuffle optimization. By extracting and analyzing shuffle dependencies prior to the actual task execution, SCache can adopt heuristic pre-scheduling combining with shuffle size prediction to pre-fetch shuffle data and balance load on each node. Meanwhile, SCache takes full advantage of the system memory to accelerate the shuffle process. We have implemented SCache and customized Spark to use it as the external shuffle service and co-scheduler. The performance of SCache is evaluated with both simulations and testbed experiments on a 50-node Amazon EC2 cluster. Those evaluations have demonstrated that, by incorporating SCache, the shuffle overhead of Spark can be reduced by nearly 89\%, and the overall completion time of TPC-DS queries improves 40\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Li:2018:HPG, author = "Xueqi Li and Guangming Tan and Bingchen Wang and Ninghui Sun", title = "High-performance genomic analysis framework with in-memory computing", journal = j-SIGPLAN, volume = "53", number = "1", pages = "317--328", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178511", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we propose an in-memory computing framework (called GPF) that provides a set of genomic formats, APIs and a fast genomic engine for large-scale genomic data processing. Our GPF comprises two main components: (1) scalable genomic data formats and API. (2) an advanced execution engine that supports efficient compression of genomic data and eliminates redundancies in the execution engine of our GPF. We further present both system and algorithm-specific implementations for users to build genomic analysis pipeline without any acquaintance of Spark parallel programming. To test the performance of GPF, we built a WGS pipeline on top of our GPF as a test case. 
Our experimental data indicate that GPF completes Whole-Genome Sequencing (WGS) analysis of the 146.9G-base Human Platinum Genome in a running time of 24 minutes, with over 50\% parallel efficiency when used on 2048 CPU cores. Together, our GPF framework provides a fast and general engine for large-scale genomic data processing that supports in-memory computing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Liu:2018:GUC, author = "Yang Liu and Jianguo Wang and Steven Swanson", title = "{Griffin}: uniting {CPU} and {GPU} in information retrieval systems for intra-query parallelism", journal = j-SIGPLAN, volume = "53", number = "1", pages = "327--337", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178512", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Interactive information retrieval services, such as enterprise search and document search, must provide relevant results with consistent, low response times in the face of rapidly growing data sets and query loads. These growing demands have led researchers to consider a wide range of optimizations to reduce response latency, including query processing parallelization and acceleration with co-processors such as GPUs. However, previous work runs queries either on GPU or CPU, ignoring the fact that the best processor for a given query depends on the query's characteristics, which may change as the processing proceeds. We present Griffin, an IR system that dynamically combines GPU- and CPU-based algorithms to process individual queries according to their characteristics. Griffin uses state-of-the-art CPU-based query processing techniques and incorporates a novel approach to GPU-based query evaluation. Our GPU-based approach, as far as we know, achieves the best available GPU search performance by leveraging a new compression scheme and exploiting an advanced merge-based intersection algorithm. We evaluate Griffin with real-world queries and datasets, and show that it improves query performance by 10x compared to a highly optimized CPU-only implementation, and 1.5x compared to our GPU approach running alone. We also find that Griffin helps reduce the 95th-, 99th-, and 99.9th-percentile query response time by 10.4x, 16.1x, and 26.8x, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Wang:2018:SFS, author = "Xinliang Wang and Weifeng Liu and Wei Xue and Li Wu", title = "{swSpTRSV}: a fast sparse triangular solve with sparse level tile layout on {Sunway} architectures", journal = j-SIGPLAN, volume = "53", number = "1", pages = "338--353", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178513", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Sparse triangular solve (SpTRSV) is one of the most important kernels in many real-world applications. Currently, much research on parallel SpTRSV focuses on level-set construction for reducing the number of inter-level synchronizations.
However, the out-of-control data reuse and high cost for global memory or shared cache access in inter-level synchronization have been largely neglected in existing work. In this paper, we propose a novel data layout called Sparse Level Tile to make all data reuse under control, and design a Producer-Consumer pairing method to make any inter-level synchronization only happen in very fast register communication. We implement our data layout and algorithms on an SW26010 many-core processor, which is the main building-block of the current world fastest supercomputer Sunway Taihulight. The experimental results of testing all 2057 square matrices from the Florida Matrix Collection show that our method achieves an average speedup of 6.9 and the best speedup of 38.5 over parallel level-set method. Our method also outperforms the latest methods on a KNC many-core processor in 1856 matrices and the latest methods on a K80 GPU in 1672 matrices, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Wilcox:2018:VVH, author = "James R. Wilcox and Cormac Flanagan and Stephen N. Freund", title = "{VerifiedFT}: a verified, high-performance precise dynamic race detector", journal = j-SIGPLAN, volume = "53", number = "1", pages = "354--367", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178514", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic data race detectors are valuable tools for testing and validating concurrent software, but to achieve good performance they are typically implemented using sophisticated concurrent algorithms. Thus, they are ironically prone to the exact same kind of concurrency bugs they are designed to detect. To address these problems, we have developed VerifiedFT, a clean slate redesign of the FastTrack race detector [19]. The VerifiedFT analysis provides the same precision guarantee as FastTrack, but is simpler to implement correctly and efficiently, enabling us to mechanically verify an implementation of its core algorithm using CIVL [27]. Moreover, VerifiedFT provides these correctness guarantees without sacrificing any performance over current state-of-the-art (but complex and unverified) FastTrack implementations for Java.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Xu:2018:EPD, author = "Yifan Xu and I-Ting Angelina Lee and Kunal Agrawal", title = "Efficient parallel determinacy race detection for two-dimensional dags", journal = j-SIGPLAN, volume = "53", number = "1", pages = "368--380", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178515", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A program is said to have a determinacy race if logically parallel parts of a program access the same memory location and one of the accesses is a write. 
These races are generally bugs in the program since they lead to non-deterministic program behavior: different schedules of the program can lead to different results. Most prior work on detecting these races focuses on a subclass of programs with fork-join parallelism. This paper presents a race-detection algorithm, 2D-Order, for detecting races in a more general class of programs, namely programs whose dependence structure can be represented as planar dags embedded in 2D grids. Such dependence structures arise from programs that use pipelined parallelism or dynamic programming recurrences. Given a computation with $ T_1 $ work and $ T_\infty $ span, 2D-Order executes it while also detecting races in $ O(T_1 / P + T_\infty) $ time on $P$ processors, which is asymptotically optimal. We also implemented PRacer, a race-detection algorithm based on 2D-Order for Cilk-P, which is a language for expressing pipeline parallelism. Empirical results demonstrate that PRacer incurs reasonable overhead and exhibits scalability similar to the baseline (executions without race detection) when running on multiple cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Acar:2018:PCM, author = "Umut A. Acar and Vitaly Aksenov and Arthur Chargu{\'e}raud and Mike Rainey", title = "Performance challenges in modular parallel programs", journal = j-SIGPLAN, volume = "53", number = "1", pages = "381--382", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178516", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Over the past decade, many programming languages and systems for parallel computing have been developed, including Cilk, Fork/Join Java, Habanero Java, Parallel Haskell, Parallel ML, and X10. Although these systems raise the level of abstraction at which parallel code is written, achieving good performance continues to require the programmer to perform extensive optimizations and tuning, often by taking various architectural details into account. One such key optimization is granularity control, which requires the programmer to determine when and how parallel tasks should be sequentialized. In this paper, we briefly describe some of the challenges associated with automatic granularity control when trying to achieve portable performance for parallel programs with arbitrary nesting of parallel constructs. We consider a result from the functional-programming community, whose starting point is to consider an ``oracle'' that can predict the work of parallel codes, and thereby control granularity.
We discuss the challenges in implementing such an oracle and proving that it has the desired theoretical properties under the nested-parallel programming model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Arif:2018:RBP, author = "Mahwish Arif and Hans Vandierendonck", title = "Reducing the burden of parallel loop schedulers for many-core processors", journal = j-SIGPLAN, volume = "53", number = "1", pages = "383--384", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178517", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This work proposes a low-overhead half-barrier pattern to schedule fine-grain parallel loops and considers its integration in the Intel OpenMP and Cilkplus schedulers. Experimental evaluation demonstrates that the scheduling overhead of our techniques is 43\% lower than Intel OpenMP and 12.1x lower than Cilk. We observe 22\% speedup on 48 threads, with a peak of 2.8x speedup.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Cohen:2018:RTA, author = "Nachshon Cohen and Erez Petrank and James R. Larus", title = "Reducing transaction aborts by looking to the future", journal = j-SIGPLAN, volume = "53", number = "1", pages = "385--386", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178518", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Transactions are widely used in database engines and they are becoming increasingly useful as a general synchronization technique for multicore machines [1]. Transactional systems allow a programmer to encapsulate multiple operations inside a transaction. All these operations appear to be executed atomically or not at all.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Correia:2018:STR, author = "Andreia Correia and Pedro Ramalhete", title = "Strong trylocks for reader-writer locks", journal = j-SIGPLAN, volume = "53", number = "1", pages = "387--388", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178519", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A reader-writer lock provides basic methods for shared and exclusive lock acquisition. A thread calling one of these methods may have to wait indefinitely to enter its critical section, with no guarantee of completion. We present two new reader-writer strong trylock algorithms, where a call to a trylock method always completes in a finite number of steps, and is guaranteed to succeed unless there is a linearizable history for which another thread has the lock. The first algorithm, named StrongTryRW, uses a single word of memory to reach consensus, thus yielding reduced scalability for readers.
To address read scalability, we designed StrongTryRWRI which matches in throughput the current state of the art reader-writer lock algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Dong:2018:SSM, author = "Yao Dong and Ana Milanova and Julian Dolby", title = "{SecureMR}: secure mapreduce using homomorphic encryption and program partitioning", journal = j-SIGPLAN, volume = "53", number = "1", pages = "389--390", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178520", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In cloud computing customers upload data and computation to cloud providers. As they cede their data to the cloud provider, they may cede data confidentiality. We develop SecureMR, a system that analyzes and transforms MapReduce programs to operate over encrypted data. SecureMR makes use of partially homomorphic encryption and a trusted client. We evaluate SecureMR on a set of MapReduce benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Firoz:2018:SDV, author = "Jesun Sahariar Firoz and Marcin Zalewski and Andrew Lumsdaine", title = "A scalable distance-1 vertex coloring algorithm for power-law graphs", journal = j-SIGPLAN, volume = "53", number = "1", pages = "391--392", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178521", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a distributed, unordered, label-correcting distance-1 vertex coloring algorithm, called Distributed Control (DC) coloring algorithm. DC eliminates the need for vertex-centric barriers and global synchronization for color refinement, relying only on atomic operations and local termination detection to update vertex color. We implement our DC coloring algorithm and the well-known Jones-Plassmann algorithm in the AM++ AMT runtime and compare their performance. We show that, with runtime support, the elimination of waiting time of vertex-centric barriers and investing this time for local ordering results in better execution time for power-law graphs with dense local subgraphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Hayashi:2018:SMP, author = "Koby Hayashi and Grey Ballard and Yujie Jiang and Michael J. 
Tobia", title = "Shared-memory parallelization of {MTTKRP} for dense tensors", journal = j-SIGPLAN, volume = "53", number = "1", pages = "393--394", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The matricized-tensor times Khatri--Rao product (MTTKRP) is the computational bottleneck for algorithms computing CP decompositions of tensors. In this work, we develop shared-memory parallel algorithms for MTTKRP involving dense tensors. The algorithms cast nearly all of the computation as matrix operations in order to use optimized BLAS subroutines, and they avoid reordering tensor entries in memory. We use our parallel implementation to compute a CP decomposition of a neuroimaging data set and achieve a speedup of up to 7.4X over existing parallel software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Jiang:2018:RPS, author = "Peng Jiang and Gagan Agrawal", title = "Revealing parallel scans and reductions in sequential loops through function reconstruction", journal = j-SIGPLAN, volume = "53", number = "1", pages = "395--396", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178523", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many sequential loops are actually scans or reductions and can be parallelized across iterations despite the loop-carried dependences. In this work, we consider the parallelization of such scan/reduction loops, and propose a practical runtime approach called sampling-and-reconstruction to extract the hidden scan/reduction patterns in these loops.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Hong:2018:PMG, author = "Changwan Hong and Aravind Sukumaran-Rajam and Jinsung Kim and Prashant Singh Rawat and Sriram Krishnamoorthy and Louis-No{\"e}l Pouchet and Fabrice Rastello and P. Sadayappan", title = "Performance modeling for {GPUs} using abstract kernel emulation", journal = j-SIGPLAN, volume = "53", number = "1", pages = "397--398", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178524", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Performance modeling of GPU kernels is a significant challenge. In this paper, we develop a novel approach to performance modeling for GPUs through abstract kernel emulation along with latency/gap modeling of resources. 
Experimental results on all benchmarks from the Rodinia suite demonstrate good accuracy in predicting execution time on multiple GPU platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Jordan:2018:TCD, author = "Herbert Jordan and Bernhard Scholz and Pavle Subotic", title = "Two concurrent data structures for efficient datalog query processing", journal = j-SIGPLAN, volume = "53", number = "1", pages = "399--400", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178525", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In recent years, Datalog has gained popularity for the implementation of advanced data analysis. Applications benefit from Datalog's high-level, declarative syntax, and availability of efficient algorithms for computing solutions. The efficiency of Datalog engines has reached a point where engines such as Souffl{\'e} have reported performance results comparable to low-level hand-crafted alternatives [3].", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Kerbl:2018:SQW, author = "Bernhard Kerbl and J{\"o}rg M{\"u}ller and Michael Kenzel and Dieter Schmalstieg and Markus Steinberger", title = "A scalable queue for work distribution on {GPUs}", journal = j-SIGPLAN, volume = "53", number = "1", pages = "401--402", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178526", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Harnessing the power of massively parallel devices like the graphics processing unit (GPU) is difficult for algorithms that show dynamic or inhomogeneous workloads. To achieve high performance, such advanced algorithms require scalable, concurrent queues to collect and distribute work. We present a new concurrent work queue, the Broker Queue, a highly efficient, linearizable queue for fine-granular work distribution on the GPU. We evaluate its usability and benefits in contrast to existing queuing algorithms. Our queue is up to one order of magnitude faster than non-blocking queues, and outperforms simpler queue designs that are unfit for fine-granular work distribution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Licht:2018:DSF, author = "Johannes de Fine Licht and Michaela Blott and Torsten Hoefler", title = "Designing scalable {FPGA} architectures using high-level synthesis", journal = j-SIGPLAN, volume = "53", number = "1", pages = "403--404", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178527", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Massive spatial parallelism at low energy gives FPGAs the potential to be core components in large scale high performance computing (HPC) systems. 
In this paper we present four major design steps that harness high-level synthesis (HLS) to implement scalable spatial FPGA algorithms. To aid productivity, we introduce the open source library hlslib to complement HLS. We evaluate kernels designed with our approach on an FPGA accelerator board, demonstrating high performance and board utilization with enhanced programmer productivity. By following our guidelines, programmers can use HLS to develop efficient parallel algorithms for FPGA, scaling their implementations with increased resources on future hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Liu:2018:LLC, author = "Bo Liu and Wenbin Jiang and Hai Jin and Xuanhua Shi and Yang Ma", title = "{Layrub}: layer-centric {GPU} memory reuse and data migration in extreme-scale deep learning systems", journal = j-SIGPLAN, volume = "53", number = "1", pages = "405--406", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178528", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Growing accuracy and robustness of Deep Neural Networks (DNN) models are accompanied by growing model capacity (going deeper or wider). However, high memory requirements of those models make it difficult to execute the training process in one GPU. To address it, we first identify the memory usage characteristics for deep and wide convolutional networks, and demonstrate the opportunities of memory reuse on both intra-layer and inter-layer levels. We then present Layrub, a runtime data placement strategy that orchestrates the execution of training process. It achieves layer-centric reuse to reduce memory consumption for extreme-scale deep learning that cannot be run on one single GPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Liu:2018:RBI, author = "Junhong Liu and Xin He and Weifeng Liu and Guangming Tan", title = "Register-based implementation of the sparse general matrix--matrix multiplication on {GPUs}", journal = j-SIGPLAN, volume = "53", number = "1", pages = "407--408", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "General sparse matrix--matrix multiplication (SpGEMM) is an essential building block in a number of applications. 
In our work, we fully utilize GPU registers and shared memory to implement an efficient and load balanced SpGEMM in comparison with the existing implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Mururu:2018:QRE, author = "Girish Mururu and Ada Gavrilovska and Santosh Pande", title = "Quantifying and reducing execution variance in {STM} via model driven commit optimization", journal = j-SIGPLAN, volume = "53", number = "1", pages = "409--410", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178530", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Simplified parallel programming coupled with an ability to express speculative computation is realized with Software Transactional Memory (STM). Although STMs are gaining popularity because of significant improvements in parallel performance, they exhibit enormous variation in transaction execution with non-repeatable performance behavior which is unacceptable in many application domains, especially in which frame rates and responsiveness should be predictable. Thus, reducing execution variance in STM is an important performance goal that has been mostly overlooked. In this work, we minimize the variance in execution time of threads in STM by reducing non-determinism exhibited due to speculation by first quantifying non-determinism and generating an automaton that models the behavior of STM. We used the automaton to guide the STM to a less non-deterministic execution that reduced the variance in frame rate by a maximum of 65\% on a version of real-world Quake3 game.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Park:2018:TGM, author = "Jungho Park and Hyungmin Cho and Wookeun Jung and Jaejin Lee", title = "Transparent {GPU} memory management for {DNNs}", journal = j-SIGPLAN, volume = "53", number = "1", pages = "411--412", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178531", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern DNN frameworks exploit GPU acceleration by default to achieve high performance. The limitation of GPU memory capacity becomes a serious problem because DNNs are becoming deeper and larger. This paper proposes a purely software-based transparent solution, called tvDNN, to the GPU memory capacity problem. It is based on GPU memory swapping and memory object sectioning techniques. It also provides an efficient memory-object swapping schedule based on ILP (optimal) and heuristics (suboptimal). 
The experimental results show that tvDNN enables Caffe to build VGG-16 with a large batch size, such as 256 or 512, using a few GB of GPU memory without significant performance degradation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "Deep Neural Network (DNN)", remark = "PPoPP '18 proceedings.", } @Article{Poter:2018:SIA, author = "Manuel P{\"o}ter and Jesper Larsson Tr{\"a}ff", title = "Stamp-it, amortized constant-time memory reclamation in comparison to five other schemes", journal = j-SIGPLAN, volume = "53", number = "1", pages = "413--414", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178532", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The memory reclamation problem is to determine, for any given allocated memory node, when there are no more references to the node, allowing it to be safely returned to the memory management system. In a concurrent context, the memory reclamation problem is highly non-trivial, since there may be more than one thread referencing an allocated node unbeknownst to the other threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Reif:2018:PSA, author = "Stefan Reif and Wolfgang Schr{\"o}der-Preikschat", title = "A predictable synchronisation algorithm", journal = j-SIGPLAN, volume = "53", number = "1", pages = "415--416", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178533", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Interaction with physical objects often imposes latency requirements on multi-core embedded systems. One consequence is the need for synchronisation algorithms that provide predictable latency, in addition to high throughput. We present a synchronisation algorithm that needs at most 7 atomic memory operations per asynchronous critical section. Its performance is at least competitive with locks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Riebler:2018:ACA, author = "Heinrich Riebler and Gavin Vaz and Tobias Kenter and Christian Plessl", title = "Automated code acceleration targeting heterogeneous {OpenCL} devices", journal = j-SIGPLAN, volume = "53", number = "1", pages = "417--418", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178534", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Accelerators can offer exceptional performance advantages. However, programmers need to spend considerable effort on acceleration, without knowing how sustainable the employed programming models, languages and tools are.
To tackle this challenge, we propose and demonstrate a new runtime system called HTrOP that is able to automatically generate and execute OpenCL code from sequential CPU code. HTrOP transforms suitable data-parallel loops into independent OpenCL-typical work-items and handles concrete calls to these devices through a mix of library components and application-specific OpenCL host code. Computational hotspots are identified and can be offloaded to different resources (CPU, GPGPU and Xeon Phi). We demonstrate the potential of HTrOP on a broad set of applications and are able to improve the performance by 4.3X on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Barrera:2018:GPA, author = "Isaac S{\'a}nchez Barrera and Marc Casas and Miquel Moret{\'o} and Eduard Ayguad{\'e} and Jes{\'u}s Labarta and Mateo Valero", title = "Graph partitioning applied to {DAG} scheduling to reduce {NUMA} effects", journal = j-SIGPLAN, volume = "53", number = "1", pages = "419--420", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178535", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The complexity of shared memory systems is becoming more relevant as the number of memory domains increases, with different access latencies and bandwidth rates depending on the proximity between the cores and the devices containing the data. In this context, techniques to manage and mitigate non-uniform memory access (NUMA) effects consist in migrating threads, memory pages or both and are typically applied by the system software. We propose techniques at the runtime system level to reduce NUMA effects on parallel applications. We leverage runtime system metadata in terms of a task dependency graph. Our approach, based on graph partitioning methods, is able to provide parallel performance improvements of 1.12X on average with respect to the state-of-the-art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Volkov:2018:MSG, author = "Vasily Volkov", title = "A microbenchmark to study {GPU} performance models", journal = j-SIGPLAN, volume = "53", number = "1", pages = "421--422", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178536", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Basic microarchitectural features of NVIDIA GPUs have been stable for a decade, and many analytic solutions were proposed to model their performance. We present a way to review, systematize, and evaluate these approaches by using a microbenchmark. In this manner, we produce a brief algebraic summary of key elements of selected performance models, identify patterns in their design, and highlight their previously unknown limitations.
Also, we identify a potentially superior method for estimating performance based on classical work.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Zhao:2018:SCG, author = "Tuowen Zhao and Mary Hall and Protonu Basu and Samuel Williams and Hans Johansen", title = "{SIMD} code generation for stencils on brick decompositions", journal = j-SIGPLAN, volume = "53", number = "1", pages = "423--424", month = jan, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3200691.3178537", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a stencil library and associated compiler code generation framework designed to maximize performance on higher-order stencil computations through the use of two main technologies: a fine-grained brick data layout designed to exploit the inherent multidimensional spatial locality endemic to stencil computations, and a vector scatter associative reordering transformation that reduces vector loads and alignment operations and exposes opportunities for the backend compiler to reduce computation. For a range of stencil computations, we compare the generated code expressed in the brick library to the standard tiled code. We attain up to a 7.2X speedup on the most complex stencils when running on an Intel Knights Landing (Xeon Phi) processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PPoPP '18 proceedings.", } @Article{Fujiki:2018:MDP, author = "Daichi Fujiki and Scott Mahlke and Reetuparna Das", title = "In-Memory Data Parallel Processor", journal = j-SIGPLAN, volume = "53", number = "2", pages = "1--14", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173171", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent developments in Non-Volatile Memories (NVMs) have opened up a new horizon for in-memory computing. Despite the significant performance gain offered by computational NVMs, previous works have relied on manual mapping of specialized kernels to the memory arrays, making it infeasible to execute more general workloads. We combat this problem by proposing a programmable in-memory processor architecture and data-parallel programming framework. The efficiency of the proposed in-memory processor comes from two sources: massive parallelism and reduction in data movement. A compact instruction set provides generalized computation capabilities for the memory array. The proposed programming framework seeks to leverage the underlying parallelism in the hardware by merging the concepts of data-flow and vector processing. To facilitate in-memory programming, we develop a compilation framework that takes a TensorFlow input and generates code for our in-memory processor. 
Our results demonstrate 7.5x speedup over a multi-core CPU server for a set of applications from Parsec and 763x speedup over a server-class GPU for a set of Rodinia benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Fix:2018:HMT, author = "Jordan Fix and Nayana P. Nagendra and Sotiris Apostolakis and Hansen Zhang and Sophie Qiu and David I. August", title = "Hardware Multithreaded Transactions", journal = j-SIGPLAN, volume = "53", number = "2", pages = "15--29", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173172", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Speculation with transactional memory systems helps programmers and compilers produce profitable thread-level parallel programs. Prior work shows that supporting transactions that can span multiple threads, rather than requiring transactions be contained within a single thread, enables new types of speculative parallelization techniques for both programmers and parallelizing compilers. Unfortunately, software support for multi-threaded transactions (MTXs) comes with significant additional inter-thread communication overhead for speculation validation. This overhead can make otherwise good parallelization unprofitable for programs with sizeable read and write sets. Some programs using these prior software MTXs overcame this problem through significant efforts by expert programmers to minimize these sets and optimize communication, capabilities which compiler technology has been unable to equivalently achieve. Instead, this paper makes speculative parallelization less laborious and more feasible through low-overhead speculation validation, presenting the first complete design, implementation, and evaluation of hardware MTXs. Even with maximal speculation validation of every load and store inside transactions of tens to hundreds of millions of instructions, profitable parallelization of complex programs can be achieved. Across 8 benchmarks, this system achieves a geomean speedup of 99\% over sequential execution on a multicore machine with 4 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Kumar:2018:BTF, author = "Rakesh Kumar and Boris Grot and Vijay Nagarajan", title = "Blasting through the Front-End Bottleneck with {Shotgun}", journal = j-SIGPLAN, volume = "53", number = "2", pages = "30--42", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173178", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The front-end bottleneck is a well-established problem in server workloads owing to their deep software stacks and large instruction working sets. Despite years of research into effective L1-I and BTB prefetching, state-of-the-art techniques force a trade-off between performance and metadata storage costs. 
This work introduces Shotgun, a BTB-directed front-end prefetcher powered by a new BTB organization that maintains a logical map of an application's instruction footprint, which enables high-efficacy prefetching at low storage cost. To map active code regions, Shotgun precisely tracks an application's global control flow (e.g., function and trap routine entry points) and summarizes local control flow within each code region. Because the local control flow enjoys high spatial locality, with most functions comprised of a handful of instruction cache blocks, it lends itself to a compact region-based encoding. Meanwhile, the global control flow is naturally captured by the application's unconditional branch working set (calls, returns, traps). Based on these insights, Shotgun devotes the bulk of its BTB capacity to branches responsible for the global control flow and a spatial encoding of their target regions. By effectively capturing a map of the application's instruction footprint in the BTB, Shotgun enables highly effective BTB-directed prefetching. Using a storage budget equivalent to a conventional BTB, Shotgun outperforms the state-of-the-art BTB-directed front-end prefetcher by up to 14\% on a set of varied commercial workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Besta:2018:SNL, author = "Maciej Besta and Syed Minhaj Hassan and Sudhakar Yalamanchili and Rachata Ausavarungnirun and Onur Mutlu and Torsten Hoefler", title = "Slim {NoC}: a Low-Diameter On-Chip Network Topology for High Energy Efficiency and Scalability", journal = j-SIGPLAN, volume = "53", number = "2", pages = "43--55", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177158", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging chips with hundreds and thousands of cores require networks with unprecedented energy/area efficiency and scalability. To address this, we propose Slim NoC (SN): a new on-chip network design that delivers significant improvements in efficiency and scalability compared to the state-of-the-art. The key idea is to use two concepts from graph and number theory, degree-diameter graphs combined with non-prime finite fields, to enable the smallest number of ports for a given core count. SN is inspired by state-of-the-art off-chip topologies; it identifies and distills their advantages for NoC settings while solving several key issues that lead to significant overheads on-chip. SN provides NoC-specific layouts, which further enhance area/energy efficiency. We show how to augment SN with state-of-the-art router microarchitecture schemes such as Elastic Links, to make the network even more scalable and efficient. Our extensive experimental evaluations show that SN outperforms both traditional low-radix topologies (e.g., meshes and tori) and modern high-radix networks (e.g., various Flattened Butterflies) in area, latency, throughput, and static/dynamic power consumption for both synthetic and real workloads. 
SN provides a promising direction in scalable and energy-efficient NoC topologies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Nguyen:2018:SCM, author = "Khanh Nguyen and Lu Fang and Christian Navasca and Guoqing Xu and Brian Demsky and Shan Lu", title = "{Skyway}: Connecting Managed Heaps in Distributed Big Data Systems", journal = j-SIGPLAN, volume = "53", number = "2", pages = "56--69", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173200", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Managed languages such as Java and Scala are prevalently used in development of large-scale distributed systems. Under the managed runtime, when performing data transfer across machines, a task frequently conducted in a Big Data system, the system needs to serialize a sea of objects into a byte sequence before sending them over the network. The remote node receiving the bytes then deserializes them back into objects. This process is both performance-inefficient and labor-intensive: (1) object serialization/deserialization makes heavy use of reflection, an expensive runtime operation and/or (2) serialization/deserialization functions need to be hand-written and are error-prone. This paper presents Skyway, a JVM-based technique that can directly connect managed heaps of different (local or remote) JVM processes. Under Skyway, objects in the source heap can be directly written into a remote heap without changing their formats. Skyway provides performance benefits to any JVM-based system by completely eliminating the need (1) of invoking serialization/deserialization functions, thus saving CPU time, and (2) of requiring developers to hand-write serialization functions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Wu:2018:EBJ, author = "Mingyu Wu and Ziming Zhao and Haoyu Li and Heting Li and Haibo Chen and Binyu Zang and Haibing Guan", title = "{Espresso}: Brewing {Java} For More Non-Volatility with Non-volatile Memory", journal = j-SIGPLAN, volume = "53", number = "2", pages = "70--83", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173201", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Fast, byte-addressable non-volatile memory (NVM) embraces both near-DRAM latency and disk-like persistence, which has generated considerable interests to revolutionize system software stack and programming models. However, it is less understood how NVM can be combined with managed runtime like Java virtual machine (JVM) to ease persistence management. This paper proposes Espresso, a holistic extension to Java and its runtime, to enable Java programmers to exploit NVM for persistence management with high performance. Espresso first provides a general persistent heap design called Persistent Java Heap (PJH) to manage persistent data as normal Java objects. 
The heap is then strengthened with a recoverable mechanism to provide crash consistency for heap metadata. Espresso further provides a new abstraction called Persistent Java Object (PJO) to provide an easy-to-use but safe persistence programming model for programmers to persist application data. Evaluation confirms that Espresso significantly outperforms state-of-art NVM support for Java (i.e., JPA and PCJ) while being compatible to data structures in existing Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Wang:2018:ECI, author = "Wenwen Wang and Stephen McCamant and Antonia Zhai and Pen-Chung Yew", title = "Enhancing Cross-{ISA} {DBT} Through Automatically Learned Translation Rules", journal = j-SIGPLAN, volume = "53", number = "2", pages = "84--97", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177160", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a novel approach for dynamic binary translation (DBT) to automatically learn translation rules from guest and host binaries compiled from the same source code. The learned translation rules are then verified via binary symbolic execution and used in an existing DBT system, QEMU, to generate more efficient host binary code. Experimental results on SPEC CINT2006 show that the average time of learning a translation rule is less than two seconds. With the rules learned from a collection of benchmark programs excluding the targeted program itself, an average 1.25X performance speedup over QEMU can be achieved for SPEC CINT2006. Moreover, the translation overhead introduced by this rule-based approach is very small even for short-running workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Rajadurai:2018:GSL, author = "Sumanaruban Rajadurai and Jeffrey Bosboom and Weng-Fai Wong and Saman Amarasinghe", title = "{Gloss}: Seamless Live Reconfiguration and Reoptimization of Stream Programs", journal = j-SIGPLAN, volume = "53", number = "2", pages = "98--112", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173170", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "An important class of applications computes on long-running or infinite streams of data, often with known fixed data rates. The latter is referred to as synchronous data flow ~(SDF) streams. These stream applications need to run on clusters or the cloud due to the high performance requirement. Further, they require live reconfiguration and reoptimization for various reasons such as hardware maintenance, elastic computation, or to respond to fluctuations in resources or application workload. However, reconfiguration and reoptimization without downtime while accurately preserving program state in a distributed environment is difficult. In this paper, we introduce Gloss, a suite of compiler and runtime techniques for live reconfiguration of distributed stream programs. 
Gloss, for the first time, avoids periods of zero throughput during the reconfiguration of both stateless and stateful SDF based stream programs. Furthermore, unlike other systems, Gloss globally reoptimizes and completely recompiles the program during reconfiguration. This permits it to reoptimize the application for entirely new configurations that it may not have encountered before. All these Gloss operations happen in-situ, requiring no extra hardware resources. We show how Gloss allows stream programs to reconfigure and reoptimize with no downtime and minimal overhead, and demonstrate the wider applicability of it via a variety of experiments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Yoon:2018:FTB, author = "Hongil Yoon and Jason Lowe-Power and Gurindar S. Sohi", title = "Filtering Translation Bandwidth with Virtual Caching", journal = j-SIGPLAN, volume = "53", number = "2", pages = "113--127", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173195", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Heterogeneous computing with GPUs integrated on the same chip as CPUs is ubiquitous, and to increase programmability many of these systems support virtual address accesses from GPU hardware. However, this entails address translation on every memory access. We observe that future GPUs and workloads show very high bandwidth demands (up to 4 accesses per cycle in some cases) for shared address translation hardware due to frequent private TLB misses. This greatly impacts performance (32\% average performance degradation relative to an ideal MMU). To mitigate this overhead, we propose a software-agnostic, practical, GPU virtual cache hierarchy. We use the virtual cache hierarchy as an effective address translation bandwidth filter. We observe many requests that miss in private TLBs find corresponding valid data in the GPU cache hierarchy. With a GPU virtual cache hierarchy, these TLB misses can be filtered (i.e., virtual cache hits), significantly reducing bandwidth demands for the shared address translation hardware. In addition, accelerator-specific attributes (e.g., less likelihood of synonyms) of GPUs reduce the design complexity of virtual caches, making a whole virtual cache hierarchy (including a shared L2 cache) practical for GPUs. Our evaluation shows that the entire GPU virtual cache hierarchy effectively filters the high address translation bandwidth, achieving almost the same performance as an ideal MMU. 
We also evaluate L1-only virtual cache designs and show that using a whole virtual cache hierarchy obtains additional performance benefits (1.31$ \times $ speedup on average).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Maleki:2018:AHP, author = "Sepideh Maleki and Martin Burtscher", title = "Automatic Hierarchical Parallelization of Linear Recurrences", journal = j-SIGPLAN, volume = "53", number = "2", pages = "128--138", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173168", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Linear recurrences encompass many fundamental computations including prefix sums and digital filters. Later result values depend on earlier result values in recurrences, making it a challenge to compute them in parallel. We present a new work- and space-efficient algorithm to compute linear recurrences that is amenable to automatic parallelization and suitable for hierarchical massively-parallel architectures such as GPUs. We implemented our approach in a domain-specific code generator that emits optimized CUDA code. Our evaluation shows that, for standard prefix sums and single-stage IIR filters, the generated code reaches the throughput of memory copy for large inputs, which cannot be surpassed. On higher-order prefix sums, it performs nearly as well as the fastest handwritten code from the literature. On tuple-based prefix sums and digital filters, our automatically parallelized code outperforms the fastest prior implementations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Ginsbach:2018:AML, author = "Philip Ginsbach and Toomas Remmelg and Michel Steuwer and Bruno Bodin and Christophe Dubach and Michael F. P. O'Boyle", title = "Automatic Matching of Legacy Code to Heterogeneous {APIs}: an Idiomatic Approach", journal = j-SIGPLAN, volume = "53", number = "2", pages = "139--153", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173182", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Heterogeneous accelerators often disappoint. They provide the prospect of great performance, but only deliver it when using vendor specific optimized libraries or domain specific languages. This requires considerable legacy code modifications, hindering the adoption of heterogeneous computing. This paper develops a novel approach to automatically detect opportunities for accelerator exploitation. We focus on calculations that are well supported by established APIs: sparse and dense linear algebra, stencil codes and generalized reductions and histograms. We call them idioms and use a custom constraint-based Idiom Description Language (IDL) to discover them within user code. Detected idioms are then mapped to BLAS libraries, cuSPARSE and clSPARSE and two DSLs: Halide and Lift. We implemented the approach in LLVM and evaluated it on the NAS and Parboil sequential C/C++ benchmarks, where we detect 60 idiom instances. 
In those cases where idioms are a significant part of the sequential execution time, we generate code that achieves 1.26x to over 20x speedup on integrated and external GPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Wang:2018:UAA, author = "Shu Wang and Chi Li and Henry Hoffmann and Shan Lu and William Sentosa and Achmad Imam Kistijantoro", title = "Understanding and Auto-Adjusting Performance-Sensitive Configurations", journal = j-SIGPLAN, volume = "53", number = "2", pages = "154--168", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173206", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern software systems are often equipped with hundreds to thousands of configurations, many of which greatly affect performance. Unfortunately, properly setting these configurations is challenging for developers due to the complex and dynamic nature of system workload and environment. In this paper, we first conduct an empirical study to understand performance-sensitive configurations and the challenges of setting them in the real-world. Guided by our study, we design a systematic and general control-theoretic framework, SmartConf, to automatically set and dynamically adjust performance-sensitive configurations to meet required operating constraints while optimizing other performance metrics. Evaluation shows that SmartConf is effective in solving real-world configuration problems, often providing better performance than even the best static configuration developers can choose under existing configuration systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Rahmani:2018:SFS, author = "Amir M. Rahmani and Bryan Donyanavard and Tiago M{\"u}ck and Kasra Moazzemi and Axel Jantsch and Onur Mutlu and Nikil Dutt", title = "{SPECTR}: Formal Supervisory Control and Coordination for Many-core Systems Resource Management", journal = j-SIGPLAN, volume = "53", number = "2", pages = "169--183", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173199", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Resource management strategies for many-core systems need to enable sharing of resources such as power, processing cores, and memory bandwidth while coordinating the priority and significance of system- and application-level objectives at runtime in a scalable and robust manner. State-of-the-art approaches use heuristics or machine learning for resource management, but unfortunately lack formalism in providing robustness against unexpected corner cases. While recent efforts deploy classical control-theoretic approaches with some guarantees and formalism, they lack scalability and autonomy to meet changing runtime goals. We present SPECTR, a new resource management approach for many-core systems that leverages formal supervisory control theory (SCT) to combine the strengths of classical control theory with state-of-the-art heuristic approaches to efficiently meet changing runtime goals. 
SPECTR is a scalable and robust control architecture and a systematic design flow for hierarchical control of many-core systems. SPECTR leverages SCT techniques such as gain scheduling to allow autonomy for individual controllers. It facilitates automatic synthesis of the high-level supervisory controller and its property verification. We implement SPECTR on an Exynos platform containing ARM's big.LITTLE-based heterogeneous multi-processor (HMP) and demonstrate that SPECTR's use of SCT is key to managing multiple interacting resources (e.g., chip power and processing cores) in the presence of competing objectives (e.g., satisfying QoS vs. power capping). The principles of SPECTR are easily applicable to any resource type and objective as long as the management problem can be modeled using dynamical systems theory (e.g., difference equations), discrete-event dynamic systems, or fuzzy dynamics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Mishra:2018:CLC, author = "Nikita Mishra and Connor Imes and John D. Lafferty and Henry Hoffmann", title = "{CALOREE}: Learning Control for Predictable Latency and Low Energy", journal = j-SIGPLAN, volume = "53", number = "2", pages = "184--198", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173184", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many modern computing systems must provide reliable latency with minimal energy. Two central challenges arise when allocating system resources to meet these conflicting goals: (1) complexity: modern hardware exposes diverse resources with complicated interactions and (2) dynamics: latency must be maintained despite unpredictable changes in operating environment or input. Machine learning accurately models the latency of complex, interacting resources, but does not address system dynamics; control theory adjusts to dynamic changes, but struggles with complex resource interaction. We therefore propose CALOREE, a resource manager that learns key control parameters to meet latency requirements with minimal energy in complex, dynamic environments. CALOREE breaks resource allocation into two sub-tasks: learning how interacting resources affect speedup, and controlling speedup to meet latency requirements with minimal energy. CALOREE defines a general control system whose parameters are customized by a learning framework while maintaining control-theoretic formal guarantees that the latency goal will be met. We test CALOREE's ability to deliver reliable latency on heterogeneous ARM big.LITTLE architectures in both single and multi-application scenarios. Compared to the best prior learning and control solutions, CALOREE reduces deadline misses by 60\% and energy consumption by 13\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Turakhia:2018:DGC, author = "Yatish Turakhia and Gill Bejerano and William J.
Dally", title = "{Darwin}: a Genomics Co-processor Provides up to $ 15 \, 000 \times $ Acceleration on Long Read Assembly", journal = j-SIGPLAN, volume = "53", number = "2", pages = "199--213", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173193", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Genomics is transforming medicine and our understanding of life in fundamental ways. Genomics data, however, is far outpacing Moore's Law. Third-generation sequencing technologies produce 100X longer reads than second generation technologies and reveal a much broader mutation spectrum of disease and evolution. However, these technologies incur prohibitively high computational costs. Over 1,300 CPU hours are required for reference-guided assembly of the human genome, and over 15,600 CPU hours are required for de novo assembly. This paper describes ``Darwin'' --- a co-processor for genomic sequence alignment that, without sacrificing sensitivity, provides up to $ 15 \, 000 \times $ speedup over the state-of-the-art software for reference-guided assembly of third-generation reads. Darwin achieves this speedup through hardware/algorithm co-design, trading more easily accelerated alignment for less memory-intensive filtering, and by optimizing the memory system for filtering. Darwin combines a hardware-accelerated version of D-SOFT, a novel filtering algorithm, alignment at high speed, and with a hardware-accelerated version of GACT, a novel alignment algorithm. GACT generates near-optimal alignments of arbitrarily long genomic sequences using constant memory for the compute-intensive step. Darwin is adaptable, with tunable speed and sensitivity to match emerging sequencing technologies and to meet the requirements of genomic applications beyond read assembly.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Zha:2018:LSM, author = "Yue Zha and Jing Li", title = "{Liquid Silicon-Monona}: a Reconfigurable Memory-Oriented Computing Fabric with Scalable Multi-Context Support", journal = j-SIGPLAN, volume = "53", number = "2", pages = "214--228", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173167", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "With the recent trend of promoting Field-Programmable Gate Arrays (FPGAs) to first-class citizens in accelerating compute-intensive applications in networking, cloud services and artificial intelligence, FPGAs face two major challenges in sustaining competitive advantages in performance and energy efficiency for diverse cloud workloads: (1) limited configuration capability for supporting light-weight computations/on-chip data storage to accelerate emerging search-/data-intensive applications. (2) lack of architectural support to hide reconfiguration overhead for assisting virtualization in a cloud computing environment. 
In this paper, we propose a reconfigurable memory-oriented computing fabric, namely Liquid Silicon-Monona (L-Si), enabled by emerging nonvolatile memory technology, i.e., RRAM, to address these two challenges. Specifically, L-Si addresses the first challenge by virtue of a new architecture comprising a 2D array of physically identical but functionally-configurable building blocks. It, for the first time, extends the configuration capabilities of existing FPGAs from computation to the whole spectrum ranging from computation to data storage. It allows users to better customize hardware by flexibly partitioning hardware resources between computation and memory, greatly benefiting emerging search- and data-intensive applications. To address the second challenge, L-Si provides scalable multi-context architectural support to minimize reconfiguration overhead for assisting virtualization. In addition, we provide compiler support to facilitate the programming of applications written in high-level programming languages (e.g. OpenCL) and frameworks (e.g. TensorFlow, MapReduce) while fully exploiting the unique architectural capability of L-Si. Our evaluation results show L-Si achieves 99.6\% area reduction, 1.43$ \times $ throughput improvement and 94.0\% power reduction on search-intensive benchmarks, as compared with the FPGA baseline. For neural network benchmarks, on average, L-Si achieves 52.3$ \times $ speedup, 113.9$ \times $ energy reduction and 81\% area reduction over the FPGA baseline. In addition, the multi-context architecture of L-Si reduces the context switching time to ~10ns, compared with an off-the-shelf FPGA (~100ms), greatly facilitating virtualization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Achour:2018:TDC, author = "Sara Achour and Martin Rinard", title = "Time Dilation and Contraction for Programmable Analog Devices with {Jaunt}", journal = j-SIGPLAN, volume = "53", number = "2", pages = "229--242", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173179", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programmable analog devices are a powerful new computing substrate that are especially appropriate for performing computationally intensive simulations of neuromorphic and cytomorphic models. Current state of the art techniques for configuring analog devices to simulate dynamical systems do not consider the current and voltage operating ranges of analog device components or the sampling limitations of the digital interface of the device. We present Jaunt, a new solver that scales the values that configure the analog device to ensure the resulting analog computation executes within the operating constraints of the device, preserves the recoverable dynamics of the original simulation, and executes slowly enough to observe these dynamics at the sampled digital outputs.
Our results show that, on a set of benchmark biological simulations, (1) unscaled configurations produce incorrect simulations because they violate the operating ranges of the device and (2) Jaunt delivers scaled configurations that respect the operating ranges to produce correct simulations with observable dynamics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Dai:2018:EDT, author = "Yuting Dai and Tao Li and Benyong Liu and Mingcong Song and Huixiang Chen", title = "Exploiting Dynamic Thermal Energy Harvesting for Reusing in {Smartphone} with Mobile Applications", journal = j-SIGPLAN, volume = "53", number = "2", pages = "243--256", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173188", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recently, mobile applications have gradually become performance- and resource- intensive, which results in a massive battery power drain and high surface temperature, and further degrades the user experience. Thus, high power consumption and surface over-heating have been considered as a severe challenge to smartphone design. In this paper, we propose DTEHR, a mobile Dynamic Thermal Energy Harvesting Reusing framework to tackle this challenge. The approach is sustainable in that it generates energy using dynamic Thermoelectric Generators (TEGs). The generated energy not only powers Thermoelectric Coolers (TECs) for cooling down hot-spots, but also recharges micro-supercapacitors (MSCs) for extended smartphone usage. To analyze thermal characteristics and evaluate DTEHR across real-world applications, we build MPPTAT (Multi-comPonent Power and Thermal Analysis Tool), a power and thermal analyzing tool for Android. The result shows that DTEHR reduces the temperature differences between hot areas and cold areas up to 15.4${}^\circ $C (internal) and 7${}^\circ $C (surface). With TEC-based hot-spots cooling, DTEHR reduces the temperature of the surface and internal hot-spots by an average of 8${}^\circ $ and 12.8mW respectively. With dynamic TEGs, DTEHR generates 2.7-15mW power, more than hundreds of times of power that TECs need to cool down hot-spots. Thus, extra-generated power can be stored into MSCs to prolong battery life.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Hu:2018:SDE, author = "Yongjian Hu and Iulian Neamtiu", title = "Static Detection of Event-based Races in {Android} Apps", journal = j-SIGPLAN, volume = "53", number = "2", pages = "257--270", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173173", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Event-based races are the main source of concurrency errors in Android apps. Prior approaches for scalable detection of event-based races have been dynamic. Due to their dynamic nature, these approaches suffer from coverage and false negative issues. We introduce a precise and scalable static approach and tool, named SIERRA, for detecting Android event-based races. 
SIERRA is centered around a new concept of ``concurrency action'' (that reifies threads, events/messages, system and user actions) and statically-derived order (happens-before relation) between actions. Establishing action order is complicated in Android, and event-based systems in general, because of externally-orchestrated control flow, use of callbacks, asynchronous tasks, and ad-hoc synchronization. We introduce several novel approaches that enable us to infer order relations statically: auto-generated code models which impose order among lifecycle and GUI events; a novel context abstraction for event-driven programs named action-sensitivity and finally, on-demand path sensitivity via backward symbolic execution to further rule out false positives. We have evaluated SIERRA on 194 Android apps. Of these, we chose 20 apps for manual analysis and comparison with a state-of-the-art dynamic race detector. Experimental results show that SIERRA is effective and efficient, typically taking 960 seconds to analyze an app and revealing 43 potential races. Compared with the dynamic race detector, SIERRA discovered an average 29.5 true races with 3.5 false positives, where the dynamic detector only discovered 4 races (hence missing 25.5 races per app) --- this demonstrates the advantage of a precise static approach. We believe that our approach opens the way for precise analysis and static event race detection in other event-driven systems beyond Android.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Guo:2018:PCA, author = "Peizhen Guo and Wenjun Hu", title = "{Potluck}: Cross-Application Approximate Deduplication for Computation-Intensive Mobile Applications", journal = j-SIGPLAN, volume = "53", number = "2", pages = "271--284", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173185", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging mobile applications, such as cognitive assistance and augmented reality (AR) based gaming, are increasingly computation-intensive and latency-sensitive, while running on resource-constrained devices. The standard approaches to addressing these involve either offloading to a cloud(let) or local system optimizations to speed up the computation, often trading off computation quality for low latency. Instead, we observe that these applications often operate on similar input data from the camera feed and share common processing components, both within the same (type of) applications and across different ones. Therefore, deduplicating processing across applications could deliver the best of both worlds. In this paper, we present Potluck, to achieve approximate deduplication. At the core of the system is a cache service that stores and shares processing results between applications and a set of algorithms to process the input data to maximize deduplication opportunities. This is implemented as a background service on Android. Extensive evaluation shows that Potluck can reduce the processing latency for our AR and vision workloads by a factor of 2.5 to 10.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Chong:2018:QCG, author = "Frederic T. 
Chong", title = "Quantum Computing is Getting Real: Architecture, {PL}, and {OS} Roles in Closing the Gap between Quantum Algorithms and Machines", journal = j-SIGPLAN, volume = "53", number = "2", pages = "285--285", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177152", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Quantum computing is at an inflection point, where 50-qubit (quantum bit) machines have been built, 100-qubit machines are just around the corner, and even 1000-qubit machines are perhaps only a few years away. These machines have the potential to fundamentally change our concept of what is computable and demonstrate practical applications in areas such as quantum chemistry, optimization, and quantum simulation. Yet a significant resource gap remains between practical quantum algorithms and real machines. There is an urgent shortage of the necessary computer scientists to work on software and architectures to close this gap. I will outline several grand research challenges in closing this gap, including programming language design, software and hardware verification, defining and perforating abstraction boundaries, cross-layer optimization, managing parallelism and communication, mapping and scheduling computations, reducing control complexity, machine-specific optimizations, learning error patterns, and many more. I will also describe the resources and infrastructure available for starting research in quantum computing and for tackling these challenges.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{DeLozier:2018:SSO, author = "Christian DeLozier and Ariel Eizenberg and Brandon Lucia and Joseph Devietti", title = "{SOFRITAS}: Serializable Ordering-Free Regions for Increasing Thread Atomicity Scalably", journal = j-SIGPLAN, volume = "53", number = "2", pages = "286--300", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173192", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Correctly synchronizing multithreaded programs is challenging and errors can lead to program failures such as atomicity violations. Existing strong memory consistency models rule out some possible failures, but are limited by depending on programmer-defined locking code. We present the new Ordering-Free Region (OFR) serializability consistency model that ensures atomicity for OFRs, which are spans of dynamic instructions between consecutive ordering constructs (e.g., barriers), without breaking atomicity at lock operations. Our platform, Serializable Ordering-Free Regions for Increasing Thread Atomicity Scalably (SOFRITAS), ensures a C/C++ program's execution is equivalent to a serialization of OFRs by default. We build two systems that realize the SOFRITAS idea: a concurrency bug finding tool for testing called SOFRITEST, and a production runtime system called SOPRO. SOFRITEST uses OFRs to find concurrency bugs, including a multi-critical-section atomicity violation in memcached that weaker consistency models will miss. 
If OFR's are too coarse-grained, SOFRITEST suggests refinement annotations automatically. Our software-only SOPRO implementation has high performance, scales well with increased parallelism, and prevents failures despite bugs in locking code. SOFRITAS has an average overhead of just 1.59x on a single-threaded execution and 1.51x on sixteen threads, despite pthreads' much weaker memory model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Markuze:2018:DOF, author = "Alex Markuze and Igor Smolyar and Adam Morrison and Dan Tsafrir", title = "{DAMN}: Overhead-Free {IOMMU} Protection for Networking", journal = j-SIGPLAN, volume = "53", number = "2", pages = "301--315", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173175", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "DMA operations can access memory buffers only if they are ``mapped'' in the IOMMU, so operating systems protect themselves against malicious/errant network DMAs by mapping and unmapping each packet immediately before/after it is DMAed. This approach was recently found to be riskier and less performant than keeping packets non-DMAable and instead copying their content to/from permanently-mapped buffers. Still, the extra copy hampers performance of multi-gigabit networking. We observe that achieving protection at the DMA (un)map boundary is needlessly constraining, as devices must be prevented from changing the data only after the kernel reads it. So there is no real need to switch ownership of buffers between kernel and device at the DMA (un)mapping layer, as opposed to the approach taken by all existing IOMMU protection schemes. We thus eliminate the extra copy by (1)~implementing a new allocator called DMA-Aware Malloc for Networking (DAMN), which (de)allocates packet buffers from a memory pool permanently mapped in the IOMMU; (2)~modifying the network stack to use this allocator; and (3)~copying packet data only when the kernel needs it, which usually morphs the aforementioned extra copy into the kernel's standard copy operation performed at the user-kernel boundary. DAMN thus provides full IOMMU protection with performance comparable to that of an unprotected system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Boroumand:2018:GWC, author = "Amirali Boroumand and Saugata Ghose and Youngsok Kim and Rachata Ausavarungnirun and Eric Shiu and Rahul Thakur and Daehyun Kim and Aki Kuusela and Allan Knies and Parthasarathy Ranganathan and Onur Mutlu", title = "{Google} Workloads for Consumer Devices: Mitigating Data Movement Bottlenecks", journal = j-SIGPLAN, volume = "53", number = "2", pages = "316--331", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173177", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We are experiencing an explosive growth in the number of consumer devices, including smartphones, tablets, web-based computers such as Chromebooks, and wearable devices. 
For this class of devices, energy efficiency is a first-class concern due to the limited battery capacity and thermal power budget. We find that data movement is a major contributor to the total system energy and execution time in consumer devices. The energy and performance costs of moving data between the memory system and the compute units are significantly higher than the costs of computation. As a result, addressing data movement is crucial for consumer devices. In this work, we comprehensively analyze the energy and performance impact of data movement for several widely-used Google consumer workloads: (1) the Chrome web browser; (2) TensorFlow Mobile, Google's machine learning framework; (3) video playback, and (4) video capture, both of which are used in many video services such as YouTube and Google Hangouts. We find that processing-in-memory (PIM) can significantly reduce data movement for all of these workloads, by performing part of the computation close to memory. Each workload contains simple primitives and functions that contribute to a significant amount of the overall data movement. We investigate whether these primitives and functions are feasible to implement using PIM, given the limited area and power constraints of consumer devices. Our analysis shows that offloading these primitives to PIM logic, consisting of either simple cores or specialized accelerators, eliminates a large amount of data movement, and significantly reduces total system energy (by an average of 55.4\% across the workloads) and execution time (by an average of 54.2\%).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Wen:2018:WSI, author = "Shasha Wen and Xu Liu and John Byrne and Milind Chabbi", title = "Watching for Software Inefficiencies with {Witch}", journal = j-SIGPLAN, volume = "53", number = "2", pages = "332--347", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177159", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Inefficiencies abound in complex, layered software. A variety of inefficiencies show up as wasteful memory operations. Many existing tools instrument every load and store instruction to monitor memory, which significantly slows execution and consumes enormously extra memory. Our lightweight framework, Witch, samples consecutive accesses to the same memory location by exploiting two ubiquitous hardware features: the performance monitoring units (PMU) and debug registers. Witch performs no instrumentation. Hence, witchcraft---tools built atop Witch---can detect a variety of software inefficiencies while introducing negligible slowdown and insignificant memory consumption and yet maintaining accuracy comparable to exhaustive instrumentation tools. Witch allowed us to scale our analysis to a large number of code bases. Guided by witchcraft, we detected several performance problems in important code bases; eliminating these inefficiencies resulted in significant speedups.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Devecsery:2018:OHA, author = "David Devecsery and Peter M. 
Chen and Jason Flinn and Satish Narayanasamy", title = "Optimistic Hybrid Analysis: Accelerating Dynamic Analysis through Predicated Static Analysis", journal = j-SIGPLAN, volume = "53", number = "2", pages = "348--362", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177153", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic analysis tools, such as those that detect data-races, verify memory safety, and identify information flow, have become a vital part of testing and debugging complex software systems. While these tools are powerful, their slow speed often limits how effectively they can be deployed in practice. Hybrid analysis speeds up these tools by using static analysis to decrease the work performed during dynamic analysis. In this paper we argue that current hybrid analysis is needlessly hampered by an incorrect assumption that preserving the soundness of dynamic analysis requires an underlying sound static analysis. We observe that, even with unsound static analysis, it is possible to achieve sound dynamic analysis for the executions which fall within the set of states statically considered. This leads us to a new approach, called optimistic hybrid analysis. We first profile a small set of executions and generate a set of likely invariants that hold true during most, but not necessarily all, executions. Next, we apply a much more precise, but unsound, static analysis that assumes these invariants hold true. Finally, we run the resulting dynamic analysis speculatively while verifying whether the assumed invariants hold true during that particular execution; if not, the program is reexecuted with a traditional hybrid analysis. Optimistic hybrid analysis is as precise and sound as traditional dynamic analysis, but is typically much faster because (1) unsound static analysis can speed up dynamic analysis much more than sound static analysis can and (2) verifications rarely fail. We apply optimistic hybrid analysis to race detection and program slicing and achieve 1.8x over a state-of-the-art race detector (FastTrack) optimized with traditional hybrid analysis and 8.3x over a hybrid backward slicer (Giri).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Katz:2018:SRC, author = "Omer Katz and Noam Rinetzky and Eran Yahav", title = "Statistical Reconstruction of Class Hierarchies in Binaries", journal = j-SIGPLAN, volume = "53", number = "2", pages = "363--376", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173202", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We address a fundamental problem in reverse engineering of object-oriented code: the reconstruction of a program's class hierarchy from its stripped binary. Existing approaches rely heavily on structural information that is not always available, e.g., calls to parent constructors. As a result, these approaches often leave gaps in the hierarchies they construct, or fail to construct them altogether. 
Our main insight is that behavioral information can be used to infer subclass/superclass relations, supplementing any missing structural information. Thus, we propose the first statistical approach for static reconstruction of class hierarchies based on behavioral similarity. We capture the behavior of each type using a statistical language model (SLM), define a metric for pairwise similarity between types based on the Kullback--Leibler divergence between their SLMs, and lift it to determine the most likely class hierarchy. We implemented our approach in a tool called ROCK and used it to automatically reconstruct the class hierarchies of several real-world stripped C++ binaries. Our results demonstrate that ROCK obtained significantly more accurate class hierarchies than those obtained using structural analysis alone.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Rigger:2018:STA, author = "Manuel Rigger and Roland Schatz and Ren{\'e} Mayrhofer and Matthias Grimmer and Hanspeter M{\"o}ssenb{\"o}ck", title = "{Sulong}, and Thanks for All the Bugs: Finding Errors in {C} Programs by Abstracting from the Native Execution Model", journal = j-SIGPLAN, volume = "53", number = "2", pages = "377--391", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173174", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In C, memory errors, such as buffer overflows, are among the most dangerous software errors; as we show, they are still on the rise. Current dynamic bug-finding tools that try to detect such errors are based on the low-level execution model of the underlying machine. They insert additional checks in an ad-hoc fashion, which makes them prone to omitting checks for corner cases. To address this, we devised a novel approach to finding bugs during the execution of a program. At the core of this approach is an interpreter written in a high-level language that performs automatic checks (such as bounds, NULL, and type checks). By mapping data structures in C to those of the high-level language, accesses are automatically checked and bugs discovered. We have implemented this approach and show that our tool (called Safe Sulong) can find bugs that state-of-the-art tools overlook, such as out-of-bounds accesses to the main function arguments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{David:2018:FPS, author = "Yaniv David and Nimrod Partush and Eran Yahav", title = "{FirmUp}: Precise Static Detection of Common Vulnerabilities in Firmware", journal = j-SIGPLAN, volume = "53", number = "2", pages = "392--404", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177157", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a static, precise, and scalable technique for finding CVEs (Common Vulnerabilities and Exposures) in stripped firmware images. Our technique is able to efficiently find vulnerabilities in real-world firmware with high accuracy. 
Given a vulnerable procedure in an executable binary and a firmware image containing multiple stripped binaries, our goal is to detect possible occurrences of the vulnerable procedure in the firmware image. Due to the variety of architectures and unique tool chains used by vendors, as well as the highly customized nature of firmware, identifying procedures in stripped firmware is extremely challenging. Vulnerability detection requires not only pairwise similarity between procedures but also information about the relationships between procedures in the surrounding executable. This observation serves as the foundation for a novel technique that establishes a partial correspondence between procedures in the two binaries. We implemented our technique in a tool called FirmUp and performed an extensive evaluation over 40 million procedures, over 4 different prevalent architectures, crawled from public vendor firmware images. We discovered 373 vulnerabilities affecting publicly available firmware, 147 of them in the latest available firmware version for the device. A thorough comparison of FirmUp to previous methods shows that it accurately and effectively finds vulnerabilities in firmware, while outperforming the detection rate of the state of the art by 45\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Alglave:2018:FSC, author = "Jade Alglave and Luc Maranget and Paul E. McKenney and Andrea Parri and Alan Stern", title = "Frightening Small Children and Disconcerting Grown-ups: Concurrency in the {Linux} Kernel", journal = j-SIGPLAN, volume = "53", number = "2", pages = "405--418", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177156", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/linux.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", abstract = "Concurrency in the Linux kernel can be a contentious topic. The Linux kernel mailing list features numerous discussions related to consistency models, including those of the more than 30 CPU architectures supported by the kernel and that of the kernel itself. How are Linux programs supposed to behave? Do they behave correctly on exotic hardware? A formal model can help address such questions. Better yet, an executable model allows programmers to experiment with the model to develop their intuition. Thus we offer a model written in the cat language, making it not only formal, but also executable by the herd simulator. We tested our model against hardware and refined it in consultation with maintainers. 
Finally, we formalised the fundamental law of the Read-Copy-Update synchronisation mechanism, and proved that one of its implementations satisfies this law.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Liu:2018:FAD, author = "Haopeng Liu and Xu Wang and Guangpu Li and Shan Lu and Feng Ye and Chen Tian", title = "{FCatch}: Automatically Detecting Time-of-fault Bugs in Cloud Systems", journal = j-SIGPLAN, volume = "53", number = "2", pages = "419--431", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177161", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is crucial for distributed systems to achieve high availability. Unfortunately, this is challenging given the common component failures (i.e., faults). Developers often cannot anticipate all the timing conditions and system states under which a fault might occur, and introduce time-of-fault (TOF) bugs that only manifest when a node crashes or a message drops at a special moment. Although challenging, detecting TOF bugs is fundamental to developing highly available distributed systems. Unlike previous work that relies on fault injection to expose TOF bugs, this paper carefully models TOF bugs as a new type of concurrency bugs, and develops FCatch to automatically predict TOF bugs by observing correct execution. Evaluation on representative cloud systems shows that FCatch is effective, accurately finding severe TOF bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Deiana:2018:UPN, author = "Enrico A. Deiana and Vincent St-Amour and Peter A. Dinda and Nikos Hardavellas and Simone Campanoni", title = "Unconventional Parallelization of Nondeterministic Applications", journal = j-SIGPLAN, volume = "53", number = "2", pages = "432--447", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173181", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The demand for thread-level-parallelism (TLP) on commodity processors is endless as it is essential for gaining performance and saving energy. However, TLP in today's programs is limited by dependences that must be satisfied at run time. We have found that for nondeterministic programs, some of these actual dependences can be satisfied with alternative data that can be generated in parallel, thus boosting the program's TLP. Satisfying these dependences with alternative data nonetheless produces final outputs that match those of the original nondeterministic program. To demonstrate the practicality of our technique, we describe the design, implementation, and evaluation of our compilers, autotuner, profiler, and runtime, which are enabled by our proposed C++ programming language extensions. 
The resulting system boosts the performance of six well-known nondeterministic and multi-threaded benchmarks by 158.2\% (geometric mean) on a 28-core Intel-based platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Ji:2018:BGB, author = "Yu Ji and Youhui Zhang and Wenguang Chen and Yuan Xie", title = "Bridge the Gap between Neural Networks and Neuromorphic Hardware with a Neural Network Compiler", journal = j-SIGPLAN, volume = "53", number = "2", pages = "448--460", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173205", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Different from developing neural networks (NNs) for general-purpose processors, the development for NN chips usually faces some hardware-specific restrictions, such as limited precision of network signals and parameters, constrained computation scale, and limited types of non-linear functions. This paper proposes a general methodology to address the challenges. We decouple the NN applications from the target hardware by introducing a compiler that can transform an existing trained, unrestricted NN into an equivalent network that meets the given hardware's constraints. We propose multiple techniques to make the transformation adaptable to different kinds of NN chips, and reliable under strict hardware constraints. We have built such a software tool that supports both spiking neural networks (SNNs) and traditional artificial neural networks (ANNs). We have demonstrated its effectiveness with a fabricated neuromorphic chip and a processing-in-memory (PIM) design. Tests show that the inference error caused by this solution is insignificant and the transformation time is much shorter than the retraining time. We have also performed parameter-sensitivity evaluations to explore the tradeoffs between network error and resource utilization for different transformation strategies, which could provide insights for co-design optimization of neuromorphic hardware and software.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Kwon:2018:MEF, author = "Hyoukjun Kwon and Ananda Samajdar and Tushar Krishna", title = "{MAERI}: Enabling Flexible Dataflow Mapping over {DNN} Accelerators via Reconfigurable Interconnects", journal = j-SIGPLAN, volume = "53", number = "2", pages = "461--475", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173176", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Deep neural networks (DNN) have demonstrated highly promising results across computer vision and speech recognition, and are becoming foundational for ubiquitous AI. The computational complexity of these algorithms and a need for high energy-efficiency has led to a surge in research on hardware accelerators.
To reduce the latency and energy costs of accessing DRAM, most DNN accelerators are spatial in nature, with hundreds of processing elements (PE) operating in parallel and communicating with each other directly. DNNs are evolving at a rapid rate, and it is common to have convolution, recurrent, pooling, and fully-connected layers with varying input and filter sizes in the most recent topologies. They may be dense or sparse. They can also be partitioned in myriad ways (within and across layers) to exploit data reuse (weights and intermediate outputs). All of the above can lead to different dataflow patterns within the accelerator substrate. Unfortunately, most DNN accelerators support only fixed dataflow patterns internally as they perform a careful co-design of the PEs and the network-on-chip (NoC). In fact, the majority of them are only optimized for traffic within a convolutional layer. This makes it challenging to map arbitrary dataflows on the fabric efficiently, and can lead to underutilization of the available compute resources. DNN accelerators need to be programmable to enable mass deployment. For them to be programmable, they need to be configurable internally to support the various dataflow patterns that could be mapped over them. To address this need, we present MAERI, which is a DNN accelerator built with a set of modular and configurable building blocks that can easily support myriad DNN partitions and mappings by appropriately configuring tiny switches. MAERI provides 8--459\% better utilization across multiple dataflow mappings over baselines with rigid NoC fabrics.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Cai:2018:VHA, author = "Ruizhe Cai and Ao Ren and Ning Liu and Caiwen Ding and Luhao Wang and Xuehai Qian and Massoud Pedram and Yanzhi Wang", title = "{VIBNN}: Hardware Acceleration of {Bayesian} Neural Networks", journal = j-SIGPLAN, volume = "53", number = "2", pages = "476--488", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173212", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Bayesian Neural Networks (BNNs) have been proposed to address the problem of model uncertainty in training and inference. By introducing weights associated with conditioned probability distributions, BNNs are capable of resolving the overfitting issue commonly seen in conventional neural networks and allow for small-data training, through the variational inference process. Frequent usage of Gaussian random variables in this process requires a properly optimized Gaussian Random Number Generator (GRNG). The high hardware cost of conventional GRNG makes the hardware implementation of BNNs challenging. In this paper, we propose VIBNN, an FPGA-based hardware accelerator design for variational inference on BNNs. We explore the design space for the massive amount of Gaussian variable sampling tasks in BNNs.
Specifically, we introduce two high performance Gaussian (pseudo) random number generators: (1) the RAM-based Linear Feedback Gaussian Random Number Generator (RLF-GRNG), which is inspired by the properties of binomial distribution and linear feedback logics; and (2) the Bayesian Neural Network-oriented Wallace Gaussian Random Number Generator. To achieve high scalability and efficient memory access, we propose a deep pipelined accelerator architecture with fast execution and good hardware utilization. Experimental results demonstrate that the proposed VIBNN implementations on an FPGA can achieve a throughput of 321,543.4 Images/s and energy efficiency of up to 52,694.8 Images/J while maintaining similar accuracy to its software counterpart.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Sadrosadati:2018:LEH, author = "Mohammad Sadrosadati and Amirhossein Mirhosseini and Seyed Borna Ehsani and Hamid Sarbazi-Azad and Mario Drumond and Babak Falsafi and Rachata Ausavarungnirun and Onur Mutlu", title = "{LTRF}: Enabling High-Capacity Register Files for {GPUs} via Hardware\slash Software Cooperative Register Prefetching", journal = j-SIGPLAN, volume = "53", number = "2", pages = "489--502", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173211", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graphics Processing Units (GPUs) employ large register files to accommodate all active threads and accelerate context switching. Unfortunately, register files are a scalability bottleneck for future GPUs due to long access latency, high power consumption, and large silicon area provisioning. Prior work proposes a hierarchical register file to reduce the register file power consumption by caching registers in a smaller register file cache. Unfortunately, this approach does not improve register access latency due to the low hit rate in the register file cache. In this paper, we propose the Latency-Tolerant Register File (LTRF) architecture to achieve low latency in a two-level hierarchical structure while keeping power consumption low. We observe that compile-time interval analysis enables us to divide GPU program execution into intervals with an accurate estimate of a warp's aggregate register working-set within each interval. The key idea of LTRF is to prefetch the estimated register working-set from the main register file to the register file cache under software control, at the beginning of each interval, and overlap the prefetch latency with the execution of other warps. Our experimental results show that LTRF enables high-capacity yet long-latency main GPU register files, paving the way for various optimizations. As an example optimization, we implement the main register file with emerging high-density high-latency memory technologies, enabling 8X larger capacity and improving overall GPU performance by 31\% while reducing register file power consumption by 46\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Ausavarungnirun:2018:MRG, author = "Rachata Ausavarungnirun and Vance Miller and Joshua Landgraf and Saugata Ghose and Jayneel Gandhi and Adwait Jog and Christopher J.
Rossbach and Onur Mutlu", title = "{MASK}: Redesigning the {GPU} Memory Hierarchy to Support Multi-Application Concurrency", journal = j-SIGPLAN, volume = "53", number = "2", pages = "503--518", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173169", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graphics Processing Units (GPUs) exploit large amounts of thread-level parallelism to provide high instruction throughput and to efficiently hide long-latency stalls. The resulting high throughput, along with continued programmability improvements, has made GPUs an essential computational resource in many domains. Applications from different domains can have vastly different compute and memory demands on the GPU. In a large-scale computing environment, to efficiently accommodate such wide-ranging demands without leaving GPU resources underutilized, multiple applications can share a single GPU, akin to how multiple applications execute concurrently on a CPU. Multi-application concurrency requires several support mechanisms in both hardware and software. One such key mechanism is virtual memory, which manages and protects the address space of each application. However, modern GPUs lack the extensive support for multi-application concurrency available in CPUs, and as a result suffer from high performance overheads when shared by multiple applications, as we demonstrate. We perform a detailed analysis of which multi-application concurrency support limitations hurt GPU performance the most. We find that the poor performance is largely a result of the virtual memory mechanisms employed in modern GPUs. In particular, poor address translation performance is a key obstacle to efficient GPU sharing. State-of-the-art address translation mechanisms, which were designed for single-application execution, experience significant inter-application interference when multiple applications spatially share the GPU. This contention leads to frequent misses in the shared translation lookaside buffer (TLB), where a single miss can induce long-latency stalls for hundreds of threads. As a result, the GPU often cannot schedule enough threads to successfully hide the stalls, which diminishes system throughput and becomes a first-order performance concern. Based on our analysis, we propose MASK, a new GPU framework that provides low-overhead virtual memory support for the concurrent execution of multiple applications. MASK consists of three novel address-translation-aware cache and memory management mechanisms that work together to largely reduce the overhead of address translation: (1) a token-based technique to reduce TLB contention, (2) a bypassing mechanism to improve the effectiveness of cached address translations, and (3) an application-aware memory scheduling scheme to reduce the interference between address translation and data requests. Our evaluations show that MASK restores much of the throughput lost to TLB contention. Relative to a state-of-the-art GPU TLB, MASK improves system throughput by 57.8\%, improves IPC throughput by 43.4\%, and reduces application-level unfairness by 22.4\%.
MASK's system throughput is within 23.2\% of an ideal GPU system with no address translation overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Yao:2018:SSG, author = "Zhihao Yao and Zongheng Ma and Yingtong Liu and Ardalan Amiri Sani and Aparna Chandramowlishwaran", title = "{Sugar}: Secure {GPU} Acceleration in {Web} Browsers", journal = j-SIGPLAN, volume = "53", number = "2", pages = "519--534", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173186", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Modern personal computers have embraced increasingly powerful Graphics Processing Units (GPUs). Recently, GPU-based graphics acceleration in web apps (i.e., applications running inside a web browser) has become popular. WebGL is the main effort to provide OpenGL-like graphics for web apps and it is currently used in 53\% of the top-100 websites. Unfortunately, WebGL has posed serious security concerns as several attack vectors have been demonstrated through WebGL. Web browsers' solutions to these attacks have been reactive: discovered vulnerabilities have been patched and new runtime security checks have been added. Unfortunately, this approach leaves the system vulnerable to zero-day vulnerability exploits, especially given the large size of the Trusted Computing Base of the graphics plane. We present Sugar, a novel operating system solution that enhances the security of GPU acceleration for web apps by design. The key idea behind Sugar is using a dedicated virtual graphics plane for a web app by leveraging modern GPU virtualization solutions. A virtual graphics plane consists of a dedicated virtual GPU (or vGPU) as well as all the software graphics stack (including the device driver). Sugar enhances the system security since a virtual graphics plane is fully isolated from the rest of the system. Despite GPU virtualization overhead, we show that Sugar achieves high performance. Moreover, unlike current systems, Sugar is able to use two underlying physical GPUs, when available, to co-render the User Interface (UI): one GPU is used to provide virtual graphics planes for web apps and the other to provide the primary graphics plane for the rest of the system. 
Such a design not only provides strong security guarantees but also enhanced performance isolation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Hsu:2018:SRP, author = "Chang-Hong Hsu and Qingyuan Deng and Jason Mars and Lingjia Tang", title = "{SmoothOperator}: Reducing Power Fragmentation and Improving Power Utilization in Large-scale Datacenters", journal = j-SIGPLAN, volume = "53", number = "2", pages = "535--548", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173190", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "With the ever-growing popularity of cloud computing and web services, Internet companies are in need of increased computing capacity to serve the demand. However, power has become a major limiting factor prohibiting the growth in industry: it is often the case that no more servers can be added to datacenters without surpassing the capacity of the existing power infrastructure. In this work, we first investigate the power utilization in Facebook datacenters. We observe that the combination of provisioning for peak power usage, highly fluctuating traffic, and multi-level power delivery infrastructure leads to a significant power budget fragmentation problem and inefficiently low power utilization. To address this issue, our insight is that heterogeneity of power consumption patterns among different services provides opportunities to re-shape the power profile of each power node by re-distributing services. By grouping services with asynchronous peak times under the same power node, we can reduce the peak power of each node, thus creating more power headroom to allow more servers to be hosted and achieve higher throughput. Based on this insight, we develop a workload-aware service placement framework to systematically spread the service instances with synchronous power patterns evenly under the power supply tree, greatly reducing the peak power draw at power nodes. We then leverage dynamic power profile reshaping to maximally utilize the headroom unlocked by our placement framework. Our experiments based on real production workloads and power traces show that we are able to host up to 13\% more machines in production, without changing the underlying power infrastructure.
Utilizing the unleashed power headroom with dynamic reshaping, we achieve up to an estimated total of 15\% and 11\% throughput improvement for latency-critical service and batch service respectively at the same time, with up to 44\% of energy slack reduction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Lee:2018:WPE, author = "Jaewon Lee and Changkyu Kim and Kun Lin and Liqun Cheng and Rama Govindaraju and Jangwoo Kim", title = "{WSMeter}: a Performance Evaluation Methodology for {Google}'s Production Warehouse-Scale Computers", journal = j-SIGPLAN, volume = "53", number = "2", pages = "549--563", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173196", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Evaluating the comprehensive performance of a warehouse-scale computer (WSC) has been a long-standing challenge. Traditional load-testing benchmarks become ineffective because they cannot accurately reproduce the behavior of thousands of distinct jobs co-located on a WSC. We therefore evaluate WSCs using actual job behaviors in live production environments. From our experience of developing multiple generations of WSCs, we identify two major challenges of this approach: (1) the lack of a holistic metric that incorporates thousands of jobs and summarizes the performance, and (2) the high costs and risks of conducting an evaluation in a live environment. To address these challenges, we propose WSMeter, a cost-effective methodology to accurately evaluate a WSC's performance using a live production environment. We first define a new metric which accurately represents a WSC's overall performance, taking a wide variety of unevenly distributed jobs into account. We then propose a model to statistically embrace the performance variance inherent in WSCs, to conduct an evaluation with minimal costs and risks. We present three real-world use cases to prove the effectiveness of WSMeter. In the first two cases, WSMeter accurately discerns 7\% and 1\% performance improvements from WSC upgrades using only 0.9\% and 6.6\% of the machines in the WSCs, respectively. We emphasize that naive statistical comparisons incur much higher evaluation costs ($ < 4 $ times) and sometimes even fail to distinguish subtle differences. 
The third case shows that a cloud customer hosting two services on our WSC quantifies the performance benefits of software optimization (+9.3\%) with minimal overheads (2.3\% of the service capacity).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Yu:2018:DAH, author = "Zhibin Yu and Zhendong Bei and Xuehai Qian", title = "Datasize-Aware High Dimensional Configurations Auto-Tuning of In-Memory Cluster Computing", journal = j-SIGPLAN, volume = "53", number = "2", pages = "564--577", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173187", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In-Memory cluster Computing (IMC) frameworks (e.g., Spark) have become increasingly important because they typically achieve more than 10$ \times $ speedups over the traditional On-Disk cluster Computing (ODC) frameworks for iterative and interactive applications. Like ODC, IMC frameworks typically run the same given programs repeatedly on a given cluster with similar input dataset size each time. It is challenging to build a performance model for an IMC program because: (1) the performance of IMC programs is more sensitive to the size of the input dataset, which is known to be difficult to incorporate into a performance model due to its complex effects on performance; (2) the number of performance-critical configuration parameters in IMC is much larger than in ODC (more than 40 vs. around 10), and the high dimensionality requires more sophisticated models to achieve high accuracy. To address this challenge, we propose DAC, a datasize-aware auto-tuning approach to efficiently identify the high dimensional configuration for a given IMC program to achieve optimal performance on a given cluster. DAC is a significant advance over the state-of-the-art because it can take the size of the input dataset and 41 configuration parameters as the parameters of the performance model for a given IMC program --- unprecedented in previous work. It is made possible by two key techniques: (1) Hierarchical Modeling (HM), which combines a number of individual sub-models in a hierarchical manner; (2) a Genetic Algorithm (GA), which is employed to search for the optimal configuration. To evaluate DAC, we use six typical Spark programs, each with five different input dataset sizes. The evaluation results show that DAC improves the performance of these six programs, compared to default configurations, by a factor of 30.4x on average and up to 89x. We also report that the geometric mean speedups of DAC over configurations by default, expert, and RFHOC are 15.4x, 2.3x, and 1.5x, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Ainsworth:2018:ETP, author = "Sam Ainsworth and Timothy M.
Jones", title = "An Event-Triggered Programmable Prefetcher for Irregular Workloads", journal = j-SIGPLAN, volume = "53", number = "2", pages = "578--592", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173189", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many modern workloads compute on large amounts of data, often with irregular memory accesses. Current architectures perform poorly for these workloads, as existing prefetching techniques cannot capture the memory access patterns; these applications end up heavily memory-bound as a result. Although a number of techniques exist to explicitly configure a prefetcher with traversal patterns, gaining significant speedups, they do not generalise beyond their target data structures. Instead, we propose an event-triggered programmable prefetcher combining the flexibility of a general-purpose computational unit with an event-based programming model, along with compiler techniques to automatically generate events from the original source code with annotations. This allows more complex fetching decisions to be made, without needing to stall when intermediate results are required. Using our programmable prefetching system, combined with small prefetch kernels extracted from applications, we achieve an average 3.0x speedup in simulation for a variety of graph, database and HPC workloads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Zhang:2018:MLO, author = "Dan Zhang and Xiaoyu Ma and Michael Thomson and Derek Chiou", title = "{Minnow}: Lightweight Offload Engines for Worklist Management and Worklist-Directed Prefetching", journal = j-SIGPLAN, volume = "53", number = "2", pages = "593--607", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173197", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The importance of irregular applications such as graph analytics is rapidly growing with the rise of Big Data. However, parallel graph workloads tend to perform poorly on general-purpose chip multiprocessors (CMPs) due to poor cache locality, low compute intensity, frequent synchronization, uneven task sizes, and dynamic task generation. At high thread counts, execution time is dominated by worklist synchronization overhead and cache misses. Researchers have proposed hardware worklist accelerators to address scheduling costs, but these proposals often harden a specific scheduling policy and do not address high cache miss rates. We address this with Minnow, a technique that augments each core in a CMP with a lightweight Minnow accelerator. Minnow engines offload worklist scheduling from worker threads to improve scalability. The engines also perform worklist-directed prefetching, a technique that exploits knowledge of upcoming tasks to issue nearly perfectly accurate and timely prefetch operations. 
On a simulated 64-core CMP running a parallel graph benchmark suite, Minnow improves scalability and reduces L2 cache misses from 29 to 1.2 MPKI on average, resulting in 6.01x average speedup over an optimized software baseline for only 1\% area overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Zhang:2018:WNA, author = "Mingxing Zhang and Yongwei Wu and Youwei Zhuo and Xuehai Qian and Chengying Huan and Kang Chen", title = "{Wonderland}: a Novel Abstraction-Based Out-Of-Core Graph Processing System", journal = j-SIGPLAN, volume = "53", number = "2", pages = "608--621", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173208", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many important graph applications are iterative algorithms that repeatedly process the input graph until convergence. For such algorithms, graph abstraction is an important technique: although much smaller than the original graph, it can bootstrap an initial result that can significantly accelerate the final convergence speed, leading to a better overall performance. However, existing graph abstraction techniques typically assume either fully in-memory or distributed environment, which leads to many obstacles preventing the application to an out-of-core graph processing system. In this paper, we propose Wonderland, a novel out-of-core graph processing system based on abstraction. Wonderland has three unique features: (1) A simple method applicable to out-of-core systems allowing users to extract effective abstractions from the original graph with acceptable cost and a specific memory limit; (2) Abstraction-enabled information propagation, where an abstraction can be used as a bridge over the disjoint on-disk graph partitions; (3) Abstraction guided priority scheduling, where an abstraction can infer the better priority-based order in processing on-disk graph partitions. Wonderland is a significant advance over the state-of-the-art because it not only makes graph abstraction feasible to out-of-core systems, but also broadens the applications of the concept in important ways. Evaluation results of Wonderland reveal that Wonderland achieves a drastic speedup over the other state-of-the-art systems, up to two orders of magnitude for certain cases.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Sabet:2018:TTI, author = "Amir Hossein Nodehi Sabet and Junqiao Qiu and Zhijia Zhao", title = "{Tigr}: Transforming Irregular Graphs for {GPU}-Friendly Graph Processing", journal = j-SIGPLAN, volume = "53", number = "2", pages = "622--636", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173180", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Graph analytics delivers deep knowledge by processing large volumes of highly connected data. In real-world graphs, the degree distribution tends to follow the power law --- a small portion of nodes own a large number of neighbors. 
The high irregularity of degree distribution acts as a major barrier to their efficient processing on GPU architectures, which are primarily designed for accelerating computations on regular data with SIMD executions. Existing solutions to the inefficiency of GPU-based graph analytics either modify the graph programming abstraction or rely on changes to the low-level thread execution models. The former requires more programming efforts for designing and maintaining graph analytics; while the latter couples with the underlying architectures, making it difficult to adapt as architectures quickly evolve. Unlike prior efforts, this work proposes to address the above fundamental problem at its origin --- the irregular graph data itself. It raises a critical question in irregular graph processing: Is it possible to transform irregular graphs into more regular ones such that the graphs can be processed more efficiently on GPU-like architectures, yet still producing the same results? Inspired by the question, this work introduces Tigr --- a graph transformation framework that can effectively reduce the irregularity of real-world graphs with correctness guarantees for a wide range of graph analytics. To make the transformations practical, Tigr features a lightweight virtual transformation scheme, which can substantially reduce the costs of graph transformations, while preserving the benefits of reduced irregularity. Evaluation on Tigr-based GPU graph processing shows significant and consistent speedup over the state-of-the-art GPU graph processing frameworks for a spectrum of irregular graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Haria:2018:DMH, author = "Swapnil Haria and Mark D. Hill and Michael M. Swift", title = "Devirtualizing Memory in Heterogeneous Systems", journal = j-SIGPLAN, volume = "53", number = "2", pages = "637--650", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173194", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Accelerators are increasingly recognized as one of the major drivers of future computational growth. For accelerators, shared virtual memory (VM) promises to simplify programming and provide safe data sharing with CPUs. Unfortunately, the overheads of virtual memory, which are high for general-purpose processors, are even higher for accelerators. Providing accelerators with direct access to physical memory (PM) in contrast, provides high performance but is both unsafe and more difficult to program. We propose Devirtualized Memory (DVM) to combine the protection of VM with direct access to PM. By allocating memory such that physical and virtual addresses are almost always identical (VA==PA), DVM mostly replaces page-level address translation with faster region-level Devirtualized Access Validation (DAV). Optionally on read accesses, DAV can be overlapped with data fetch to hide VM overheads. DVM requires modest OS and IOMMU changes, and is transparent to the application. Implemented in Linux 4.10, DVM reduces VM overheads in a graph-processing accelerator to just 1.6\% on average. 
DVM also improves performance by 2.1X over an optimized conventional VM implementation, while consuming 3.9X less dynamic energy for memory management. We further discuss DVM's potential to extend beyond accelerators to CPUs, where it reduces VM overheads to 5\% on average, down from 29\% for conventional VM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Kumar:2018:LLT, author = "Mohan Kumar Kumar and Steffen Maass and Sanidhya Kashyap and J{\'a}n Vesel{\'y} and Zi Yan and Taesoo Kim and Abhishek Bhattacharjee and Tushar Krishna", title = "{LATR}: Lazy Translation Coherence", journal = j-SIGPLAN, volume = "53", number = "2", pages = "651--664", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173198", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose LATR (lazy TLB coherence), a software-based TLB shootdown mechanism that can alleviate the overhead of the synchronous TLB shootdown mechanism in existing operating systems. By handling TLB coherence in a lazy fashion, LATR can avoid expensive IPIs which are required for delivering a shootdown signal to remote cores, and the performance overhead of associated interrupt handlers. Therefore, virtual memory operations, such as free and page migration operations, can benefit significantly from LATR's mechanism. For example, LATR improves the latency of munmap() by 70.8\% on a 2-socket machine, a widely used configuration in modern data centers. Real-world, performance-critical applications such as web servers can also benefit from LATR: without any application-level changes, LATR improves Apache by 59.9\% compared to Linux, and by 37.9\% compared to ABIS, a highly optimized, state-of-the-art TLB coherence technique.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Taassori:2018:VRP, author = "Meysam Taassori and Ali Shafiee and Rajeev Balasubramonian", title = "{VAULT}: Reducing Paging Overheads in {SGX} with Efficient Integrity Verification Structures", journal = j-SIGPLAN, volume = "53", number = "2", pages = "665--678", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177155", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Intel's SGX offers state-of-the-art security features, including confidentiality, integrity, and authentication (CIA) when accessing sensitive pages in memory. Sensitive pages are placed in an Enclave Page Cache (EPC) within the physical memory before they can be accessed by the processor. To control the overheads imposed by CIA guarantees, the EPC operates with a limited capacity (currently 128 MB). Because of this limited EPC size, sensitive pages must be frequently swapped between EPC and non-EPC regions in memory. A page swap is expensive (about 40K cycles) because it requires an OS system call, page copying, updates to integrity trees and metadata, etc.
Our analysis shows that the paging overhead can slow the system on average by 5$ \times $, and other studies have reported even higher slowdowns for memory-intensive workloads. The paging overhead can be reduced by growing the size of the EPC to match the size of physical memory, while allowing the EPC to also accommodate non-sensitive pages. However, at least two important problems must be addressed to enable this growth in EPC: (i) the depth of the integrity tree and its cacheability must be improved to keep memory bandwidth overheads in check, and (ii) the space overheads of integrity verification (tree and MACs) must be reduced. We achieve both goals by introducing a variable arity unified tree (VAULT) organization that is more compact and has lower depth. We further reduce the space overheads with techniques that combine MAC sharing and compression. With simulations, we show that the combination of our techniques can address most inefficiencies in SGX memory access and improve overall performance by 3.7$ \times $, relative to an SGX baseline, while incurring a memory capacity overhead of only 4.7\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Panwar:2018:MHP, author = "Ashish Panwar and Aravinda Prasad and K. Gopinath", title = "Making Huge Pages Actually Useful", journal = j-SIGPLAN, volume = "53", number = "2", pages = "679--692", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173203", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The virtual-to-physical address translation overhead, a major performance bottleneck for modern workloads, can be effectively alleviated with huge pages. However, since huge pages must be mapped contiguously, OSs have not been able to use them well because of the memory fragmentation problem despite hardware support for huge pages being available for nearly two decades. This paper presents a comprehensive study of the interaction of fragmentation with huge pages in the Linux kernel. We observe that when huge pages are used, problems such as high CPU utilization and latency spikes occur because of unnecessary work (e.g., useless page migration) performed by memory management related subsystems due to the poor handling of unmovable (i.e., kernel) pages. This behavior is even more harmful in virtualized systems where unnecessary work may be performed in both guest and host OSs. We present Illuminator, an efficient memory manager that provides various subsystems, such as the page allocator, the ability to track all unmovable pages. It allows subsystems to make informed decisions and eliminate unnecessary work, which in turn leads to cost-effective huge page allocations. Illuminator reduces the cost of compaction (up to 99\%), improves application performance (up to 2.3x) and reduces the maximum latency of the MySQL database server (by 30x).
Importantly, this work shows the effectiveness of a simple solution for long-standing huge page related problems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Evtyushkin:2018:BNS, author = "Dmitry Evtyushkin and Ryan Riley and Nael Abu-Ghazaleh and Dmitry Ponomarev", title = "{BranchScope}: a New Side-Channel Attack on Directional Branch Predictor", journal = j-SIGPLAN, volume = "53", number = "2", pages = "693--707", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173204", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present BranchScope --- a new side-channel attack where the attacker infers the direction of an arbitrary conditional branch instruction in a victim program by manipulating the shared directional branch predictor. The directional component of the branch predictor stores the prediction on a given branch (taken or not-taken) and is a different component from the branch target buffer (BTB) attacked by previous work. BranchScope is the first fine-grained attack on the directional branch predictor, expanding our understanding of the side channel vulnerability of the branch prediction unit. Our attack targets complex hybrid branch predictors with unknown organization. We demonstrate how an attacker can force these predictors to switch to a simple 1-level mode to simplify the direction recovery. We carry out BranchScope on several recent Intel CPUs and also demonstrate the attack against an SGX enclave.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Dickens:2018:SCI, author = "Bernard {Dickens III} and Haryadi S. Gunawi and Ariel J. Feldman and Henry Hoffmann", title = "{StrongBox}: Confidentiality, Integrity, and Performance using Stream Ciphers for Full Drive Encryption", journal = j-SIGPLAN, volume = "53", number = "2", pages = "708--721", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173183", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Full-drive encryption (FDE) is especially important for mobile devices because they contain large quantities of sensitive data yet are easily lost or stolen. Unfortunately, the standard approach to FDE (the AES block cipher in XTS mode) is 3--5$ \times $ slower than unencrypted storage. Authenticated encryption based on stream ciphers is already used as a faster alternative to AES in other contexts, such as HTTPS, but the conventional wisdom is that stream ciphers are unsuitable for FDE. Used naively in drive encryption, stream ciphers are vulnerable to attacks, and mitigating these attacks with on-drive metadata is generally believed to ruin performance. In this paper, we argue that recent developments in mobile hardware invalidate this assumption, making it possible to use fast stream ciphers for FDE.
Modern mobile devices employ solid-state storage with Flash Translation Layers (FTL), which operate similarly to Log-structured File Systems (LFS). They also include trusted hardware such as Trusted Execution Environments (TEEs) and secure storage areas. Leveraging these two trends, we propose StrongBox, a stream cipher-based FDE layer that is a drop-in replacement for dm-crypt, the standard Linux FDE module based on AES-XTS. StrongBox introduces a system design and on-drive data structures that exploit LFS's lack of overwrites to avoid costly rekeying and a counter stored in trusted hardware to protect against attacks. We implement StrongBox on an ARM big.LITTLE mobile processor and test its performance under multiple popular production LFSes. We find that StrongBox improves read performance by as much as 2.36$ \times $ (1.72$ \times $ on average) while offering stronger integrity guarantees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Hunger:2018:DDC, author = "Casen Hunger and Lluis Vilanova and Charalampos Papamanthou and Yoav Etsion and Mohit Tiwari", title = "{DATS} --- Data Containers for {Web} Applications", journal = j-SIGPLAN, volume = "53", number = "2", pages = "722--736", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173213", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Data containers enable users to control access to their data while untrusted applications compute on it. However, they require replicating an application inside each container --- compromising functionality, programmability, and performance. We propose DATS --- a system to run web applications that retains application usability and efficiency through a mix of hardware capability enhanced containers and the introduction of two new primitives modeled after the popular model-view-controller (MVC) pattern. (1) DATS introduces a templating language to create views that compose data across data containers. (2) DATS uses authenticated storage and confinement to enable an untrusted storage service, such as memcached and deduplication, to operate on plain-text data across containers. These two primitives act as robust declassifiers that allow DATS to enforce non-interference across containers, taking large applications out of the trusted computing base (TCB). 
We showcase eight different web applications including Gitlab and a Slack-like chat, significantly improve the worst-case overheads due to application replication, and demonstrate usable performance for common-case usage.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Mallon:2018:DPP, author = "Stephen Mallon and Vincent Gramoli and Guillaume Jourjon", title = "{DLibOS}: Performance and Protection with a Network-on-Chip", journal = j-SIGPLAN, volume = "53", number = "2", pages = "737--750", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173209", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A long body of research work has led to the conjecture that highly efficient IO processing at user-level would necessarily violate protection. In this paper, we debunk this myth by introducing DLibOS a new paradigm that consists of distributing a library OS on specialized cores to achieve performance and protection at the user-level. Its main novelty consists of leveraging network-on-chip to allow hardware message passing, rather than context switches, for communication between different address spaces. To demonstrate the feasibility of our approach, we implement a driver and a network stack at user-level on a Tilera many-core machine. We define a novel asynchronous socket interface and partition the memory such that the reception, the transmission and the application modify isolated regions. Our high performance results of 4.2 and 3.1 million requests per second obtained on a webserver and the Memcached applications, respectively, confirms the relevance of our design decisions. Finally, we compare DLibOS against a non-protected user-level network stack and show that protection comes at a negligible cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Lin:2018:AIA, author = "Shih-Chieh Lin and Yunqi Zhang and Chang-Hong Hsu and Matt Skach and Md E. Haque and Lingjia Tang and Jason Mars", title = "The Architectural Implications of Autonomous Driving: Constraints and Acceleration", journal = j-SIGPLAN, volume = "53", number = "2", pages = "751--766", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173191", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Autonomous driving systems have attracted a significant amount of interest recently, and many industry leaders, such as Google, Uber, Tesla, and Mobileye, have invested a large amount of capital and engineering power on developing such systems. Building autonomous driving systems is particularly challenging due to stringent performance requirements in terms of both making the safe operational decisions and finishing processing at real-time. Despite the recent advancements in technology, such systems are still largely under experimentation and architecting end-to-end autonomous driving systems remains an open research question. 
To investigate this question, we first present and formalize the design constraints for building an autonomous driving system in terms of performance, predictability, storage, thermal and power. We then build an end-to-end autonomous driving system using state-of-the-art award-winning algorithms to understand the design trade-offs for building such systems. In our real-system characterization, we identify three computational bottlenecks, which conventional multicore CPUs are incapable of processing under the identified design constraints. To meet these constraints, we accelerate these algorithms using three accelerator platforms including GPUs, FPGAs, and ASICs, which can reduce the tail latency of the system by 169x, 10x, and 93x respectively. With accelerator-based designs, we are able to build an end-to-end autonomous driving system that meets all the design constraints, and explore the trade-offs among performance, power and the higher accuracy enabled by higher resolution cameras.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Colin:2018:RES, author = "Alexei Colin and Emily Ruppel and Brandon Lucia", title = "A Reconfigurable Energy Storage Architecture for Energy-harvesting Devices", journal = j-SIGPLAN, volume = "53", number = "2", pages = "767--781", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173210", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Battery-free, energy-harvesting devices operate using energy collected exclusively from their environment. Energy-harvesting devices allow maintenance-free deployment in extreme environments, but requires a power system to provide the right amount of energy when an application needs it. Existing systems must provision energy capacity statically based on an application's peak demand which compromises efficiency and responsiveness when not at peak demand. This work presents Capybara: a co-designed hardware/software power system with dynamically reconfigurable energy storage capacity that meets varied application energy demand. The Capybara software interface allows programmers to specify the energy mode of an application task. Capybara's runtime system reconfigures Capybara's hardware energy capacity to match application demand. Capybara also allows a programmer to write reactive application tasks that pre-allocate a burst of energy that it can spend in response to an asynchronous (e.g., external) event. We instantiated Capybara's hardware design in two EH devices and implemented three reactive sensing applications using its software interface. 
Capybara improves event detection accuracy by 2x-4x over statically-provisioned energy capacity, maintains response latency within 1.5x of a continuously-powered baseline, and enables reactive applications that are intractable with existing power systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Ma:2018:NNE, author = "Kaisheng Ma and Xueqing Li and Mahmut Taylan Kandemir and Jack Sampson and Vijaykrishnan Narayanan and Jinyang Li and Tongda Wu and Zhibo Wang and Yongpan Liu and Yuan Xie", title = "{NEOFog}: Nonvolatility-Exploiting Optimizations for Fog Computing", journal = j-SIGPLAN, volume = "53", number = "2", pages = "782--796", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3177154", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nonvolatile processors have emerged as one of the promising solutions for energy harvesting scenarios, among which Wireless Sensor Networks (WSN) provide some of the most important applications. In a typical distributed sensing system, due to differences in location, energy harvester angles, power sources, etc., different nodes may have different amounts of energy ready for use. While prior approaches have examined these challenges, they have not done so in the context of the features offered by nonvolatile computing approaches, which disrupt certain foundational assumptions. We propose a new set of nonvolatility-exploiting optimizations and embody them in the NEOFog system architecture. We discuss shifts in the tradeoffs in data and program distribution for nonvolatile processing-based WSNs, showing how non-volatile processing and non-volatile RF support alter the benefits of computation and communication-centric approaches. We also propose a new algorithm specific to nonvolatile sensing systems for load balancing both computation and communication demands. Collectively, the NV-aware optimizations in NEOFog increase the ability to perform in-fog processing by 4.2X and can increase this to 8X if virtualized nodes are 3X multiplexed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Lottarini:2018:VBV, author = "Andrea Lottarini and Alex Ramirez and Joel Coburn and Martha A. Kim and Parthasarathy Ranganathan and Daniel Stodolsky and Mark Wachsler", title = "{\tt vbench}: Benchmarking Video Transcoding in the Cloud", journal = j-SIGPLAN, volume = "53", number = "2", pages = "797--809", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173207", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents vbench, a publicly available benchmark for cloud video services. We are the first study, to the best of our knowledge, to characterize the emerging video-as-a-service workload. Unlike prior video processing benchmarks, vbench's videos are algorithmically selected to represent a large commercial corpus of millions of videos.
Reflecting the complex infrastructure that processes and hosts these videos, vbench includes carefully constructed metrics and baselines. The combination of validated corpus, baselines, and metrics reveals nuanced tradeoffs between speed, quality, and compression. We demonstrate the importance of video selection with a microarchitectural study of cache, branch, and SIMD behavior. vbench reveals trends from the commercial corpus that are not visible in other video corpuses. Our experiments with GPUs under vbench's scoring scenarios reveal that context is critical: GPUs are well suited for live-streaming, while video-on-demand shifts costs from compute to storage and network. Counterintuitively, they are not viable for popular videos, for which highly compressed, high quality copies are required. We instead find that popular videos are well served by the current trajectory of software encoders.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Blackburn:2018:SDS, author = "Steve Blackburn", title = "Session details: Session 7B: Memory 2", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252965", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Carter:2018:SDS, author = "John Carter", title = "Session details: Session 6B: Datacenters", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252963", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Criswell:2018:SDS, author = "John Criswell", title = "Session details: Session 8A: Security and Protection", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252966", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Gandhi:2018:SDS, author = "Jayneel Gandhi", title = "Session details: Session 6A: {GPU} 2", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252962", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", }
@Article{Hoffmann:2018:SDS, author = "Hank Hoffmann", title = "Session details: Session 5A: Concurrency and Parallelism", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252960", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Kim:2018:SDS, author = "Martha Kim", title = "Session details: Session 7A: Irregular Apps and Graphs", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252964", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Larus:2018:SDS, author = "James Larus", title = "Session details: Session 2B: Performance Management", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252955", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Lee:2018:SDS, author = "Dongyoon Lee", title = "Session details: Session 3B: Mobile Applications", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252957", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Liu:2018:SDS, author = "Lei Liu", title = "Session details: Session 1B: Managed Runtimes and Dynamic Translation", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252953", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Lu:2018:SDS, author = "Shan Lu", title = "Session details: Session 4B: Program Analysis", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252959", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Narayanasamy:2018:SDS, author = "Satish Narayanasamy", title = "Session details: Session 3A: Programmable Devices and Co-processors", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252956", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Rossbach:2018:SDS, author = "Christopher J. Rossbach", title = "Session details: Session 2A: {GPUs} 1", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252954", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Sampson:2018:SDS, author = "Adrian Sampson", title = "Session details: Session 5B: Neural Networks", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252961", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Solihin:2018:SDS, author = "Yan Solihin", title = "Session details: Session 8B: Potpourri", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252967", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Torrellas:2018:SDS, author = "Josep Torrellas", title = "Session details: Session 1A: New Architectures", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252952", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Tsafrir:2018:SDS, author = "Dan Tsafrir", title = "Session details: Session 4A: Memory 1", journal = j-SIGPLAN, volume = "53", number = "2", pages = "??--??", month = feb,
year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3252958", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ASPLOS '18 proceedings.", } @Article{Wang:2018:HSA, author = "Kunshan Wang and Stephen M. Blackburn and Antony L. Hosking and Michael Norrish", title = "Hop, Skip, \& Jump: Practical On-Stack Replacement for a Cross-Platform Language-Neutral {VM}", journal = j-SIGPLAN, volume = "53", number = "3", pages = "1--16", month = mar, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296975.3186412", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "On-stack replacement (OSR) is a performance-critical technology for many languages, especially dynamic languages. Conventional wisdom, apparent in JavaScript engines such as V8 and SpiderMonkey, is that OSR must be implemented in a low-level (i.e., in assembly) and language-specific way. This paper presents an OSR abstraction based on Swapstack, materialized as the API for a low-level virtual machine, and shows how the abstraction of resumption protocols facilitates an elegant implementation of this API on real hardware. Using an experimental JavaScript implementation, we demonstrate that this API enables the language implementation to perform OSR without the need to deal with machine-level details. We also show that the API itself is implementable on concrete hardware. This work helps crystallize OSR abstractions and, by providing a reusable implementation, brings OSR within reach for more language implementers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '18 proceedings.", } @Article{Wang:2018:IDG, author = "Wenwen Wang and Jiacheng Wu and Xiaoli Gong and Tao Li and Pen-Chung Yew", title = "Improving Dynamically-Generated Code Performance on Dynamic Binary Translators", journal = j-SIGPLAN, volume = "53", number = "3", pages = "17--30", month = mar, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296975.3186413", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The recent transition in the software industry toward dynamically generated code poses a new challenge to existing dynamic binary translation (DBT) systems. A significant re-translation overhead could be introduced due to the maintenance of the consistency between the dynamically-generated guest code and the corresponding translated host code. To address this issue, this paper presents a novel approach to optimize DBT systems for guest applications with dynamically-generated code. The proposed approach can maximize the reuse of previously translated host code to mitigate the re-translation overhead. A prototype based on such an approach has been implemented on an existing DBT system HQEMU. 
Experimental results on a set of JavaScript applications show that it can achieve a 1.24X performance speedup on average compared to the original HQEMU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '18 proceedings.", } @Article{Ma:2018:GEG, author = "Jiacheng Ma and Xiao Zheng and Yaozu Dong and Wentai Li and Zhengwei Qi and Bingsheng He and Haibing Guan", title = "{gMig}: Efficient {GPU} Live Migration Optimized by Software Dirty Page for Full Virtualization", journal = j-SIGPLAN, volume = "53", number = "3", pages = "31--44", month = mar, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296975.3186414", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "This paper introduces gMig, an open-source and practical GPU live migration solution for full virtualization. By taking advantage of the dirty pattern of GPU workloads, gMig presents the One-Shot Pre-Copy combined with the hashing based Software Dirty Page technique to achieve efficient GPU live migration. Particularly, we propose three approaches for gMig: (1) Dynamic Graphics Address Remapping, which parses and manipulates GPU commands to adjust the address mapping to adapt to a different environment after migration, (2) Software Dirty Page, which utilizes a hashing based approach to detect page modification, overcomes the commodity GPU's hardware limitation, and speeds up the migration by only sending the dirtied pages, (3) One-Shot Pre-Copy, which greatly reduces the rounds of pre-copy of graphics memory. Our evaluation shows that gMig achieves GPU live migration with an average downtime of 302 ms on Windows and 119 ms on Linux. With the help of Software Dirty Page, the number of GPU pages transferred during the downtime is effectively reduced by 80.0\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '18 proceedings.", } @Article{Ruprecht:2018:VLM, author = "Adam Ruprecht and Danny Jones and Dmitry Shiraev and Greg Harmon and Maya Spivak and Michael Krebs and Miche Baker-Harvey and Tyler Sanderson", title = "{VM} Live Migration At Scale", journal = j-SIGPLAN, volume = "53", number = "3", pages = "45--56", month = mar, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296975.3186415", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Uninterrupted uptime is a critical aspect of Virtual Machines (VMs) offered by cloud hosting providers. Google's VMs run on top of rapidly changing infrastructure: we regularly update hardware and host software, and we must quickly respond to failing hardware. Frequent change is critical to both development velocity---deploying new versions of services and infrastructure---and the ability to respond rapidly to defects, including critical security fixes. Typically these updates would be disruptive, resulting in VM termination or restart. 
In this paper we present how we use VM live migration at scale to eliminate this disruption with minimal impact to the guest, performing over 1,000,000 migrations monthly in our production fleet, with 50ms median blackout, 300ms 99th percentile blackout.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '18 proceedings.", } @Article{Xu:2018:DES, author = "Yu Xu and Jianguo Yao and Yaozu Dong and Kun Tian and Xiao Zheng and Haibing Guan", title = "{Demon}: an Efficient Solution for on-Device {MMU} Virtualization in Mediated Pass-Through", journal = j-SIGPLAN, volume = "53", number = "3", pages = "57--70", month = mar, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296975.3186416", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Memory Management Units (MMUs) for on-device address translation are widely used in modern devices. However, conventional solutions for on-device MMU virtualization, such as shadow page table implemented in mediated pass-through, still suffer from high complexity and low performance. We present Demon, an efficient solution for on-DEvice MMU virtualizatiON in mediated pass-through. The key insight is that Demon takes advantage of IOMMU to construct a two-dimensional address translation and dynamically switches the 2nd-dimensional page table to a proper candidate when the device owner switches. In order to support fine-grained parallelism for the device with multiple engines, we put forward a hardware proposal that separates the address space of each engine and enables simultaneous device address remapping for multiple virtual machines (VMs). We implement Demon with a prototype named gDemon which virtualizes Intel GPU MMU. Nonetheless, Demon is not limited to this particular case. Evaluations show that gDemon provides up to 19.73x better performance in the media transcoding workloads and achieves performance improvement of up to 17.09\% and 13.73\% in the 2D benchmarks and 3D benchmarks, respectively, compared with gVirt. The current release of gDemon scales up to 6 VMs with moderate performance in our experiments. In addition, gDemon simplifies the implementation of GPU MMU virtualization with 37\% code reduction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '18 proceedings.", } @Article{Yan:2018:FPS, author = "Qiuchen Yan and Stephen McCamant", title = "{Fast PokeEMU}: Scaling Generated Instruction Tests Using Aggregation and State Chaining", journal = j-SIGPLAN, volume = "53", number = "3", pages = "71--83", month = mar, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296975.3186417", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software that emulates a CPU has many applications, but is difficult to implement correctly and requires extensive testing. Since a large number of test cases are required for full coverage, it is important that the tests execute efficiently.
We explore techniques for combining many instruction tests into one program to amortize overheads such as booting an emulator. To ensure the results of each test are reflected in a final result, we use the outputs of one instruction test as an input to the next, and adopt the ``Feistel network'' construction from cryptography so that each step is invertible. We evaluate this approach by applying it to PokeEMU, a tool that generates emulator tests using symbolic execution. The combined tests run much faster, but still reveal most of the same behavior differences as when run individually.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '18 proceedings.", } @Article{Rigger:2018:AXI, author = "Manuel Rigger and Stefan Marr and Stephen Kell and David Leopoldseder and Hanspeter M{\"o}ssenb{\"o}ck", title = "An Analysis of x86-64 Inline Assembly in {C} Programs", journal = j-SIGPLAN, volume = "53", number = "3", pages = "84--99", month = mar, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296975.3186418", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "C codebases frequently embed nonportable and unstandardized elements such as inline assembly code. Such elements are not well understood, which poses a problem to tool developers who aspire to support C code. This paper investigates the use of x86-64 inline assembly in 1264 C projects from GitHub and combines qualitative and quantitative analyses to answer questions that tool authors may have. We found that 28.1\% of the most popular projects contain inline assembly code, although the majority contain only a few fragments with just one or two instructions. The most popular instructions constitute a small subset concerned largely with multicore semantics, performance optimization, and hardware control. Our findings are intended to help developers of C-focused tools, those testing compilers, and language designers seeking to reduce the reliance on inline assembly. They may also aid the design of tools focused on inline assembly itself.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "VEE '18 proceedings.", } @Article{Panchekha:2018:VWP, author = "Pavel Panchekha and Adam T. Geller and Michael D. Ernst and Zachary Tatlock and Shoaib Kamil", title = "Verifying that web pages have accessible layout", journal = j-SIGPLAN, volume = "53", number = "4", pages = "1--14", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192407", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Usability and accessibility guidelines aim to make graphical user interfaces accessible to all users, by, say, requiring that text is sufficiently large, interactive controls are visible, and heading size corresponds to importance. These guidelines must hold on the infinitely many possible renderings of a web page generated by differing screen sizes, fonts, and other user preferences. 
Today, these guidelines are tested by manual inspection of a few renderings, because (1) the guidelines are not expressed in a formal language, (2) the semantics of browser rendering are not well understood, and (3) no tools exist to check all possible renderings of a web page. VizAssert solves these problems. First, it introduces visual logic to precisely specify accessibility properties. Second, it formalizes a large fragment of the browser rendering algorithm using novel finitization reductions. Third, it provides a sound, automated tool for verifying assertions in visual logic. We encoded 14 assertions drawn from best-practice accessibility and mobile-usability guidelines in visual logic. VizAssert checked them on 62 professionally designed web pages. It found 64 distinct errors in the web pages, while reporting only 13 false positive warnings.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Vilk:2018:BAD, author = "John Vilk and Emery D. Berger", title = "{BLeak}: automatically debugging memory leaks in web applications", journal = j-SIGPLAN, volume = "53", number = "4", pages = "15--29", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192376", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Despite the presence of garbage collection in managed languages like JavaScript, memory leaks remain a serious problem. In the context of web applications, these leaks are especially pervasive and difficult to debug. Web application memory leaks can take many forms, including failing to dispose of unneeded event listeners, repeatedly injecting iframes and CSS files, and failing to call cleanup routines in third-party libraries. Leaks degrade responsiveness by increasing GC frequency and overhead, and can even lead to browser tab crashes by exhausting available memory. Because previous leak detection approaches designed for conventional C, C++ or Java applications are ineffective in the browser environment, tracking down leaks currently requires intensive manual effort by web developers. This paper introduces BLeak (Browser Leak debugger), the first system for automatically debugging memory leaks in web applications. BLeak's algorithms leverage the observation that in modern web applications, users often repeatedly return to the same (approximate) visual state (e.g., the inbox view in Gmail). Sustained growth between round trips is a strong indicator of a memory leak. To use BLeak, a developer writes a short script (17-73 LOC on our benchmarks) to drive a web application in round trips to the same visual state. BLeak then automatically generates a list of leaks found along with their root causes, ranked by return on investment. Guided by BLeak, we identify and fix over 50 memory leaks in popular libraries and apps including Airbnb, AngularJS, Google Analytics, Google Maps SDK, and jQuery. BLeak's median precision is 100\%; fixing the leaks it identifies reduces heap growth by an average of 94\%, saving from 0.5 MB to 8 MB per round trip.
We believe BLeak's approach to be broadly applicable beyond web applications, including to GUI applications on desktop and mobile platforms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Baxter:2018:PAS, author = "Samuel Baxter and Rachit Nigam and Joe Gibbs Politz and Shriram Krishnamurthi and Arjun Guha", title = "Putting in all the stops: execution control for {JavaScript}", journal = j-SIGPLAN, volume = "53", number = "4", pages = "30--45", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192370", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scores of compilers produce JavaScript, enabling programmers to use many languages on the Web, reuse existing code, and even use Web IDEs. Unfortunately, most compilers inherit the browser's compromised execution model, so long-running programs freeze the browser tab, infinite loops crash IDEs, and so on. The few compilers that avoid these problems suffer poor performance and are difficult to engineer. This paper presents Stopify, a source-to-source compiler that extends JavaScript with debugging abstractions and blocking operations, and easily integrates with existing compilers. We apply Stopify to ten programming languages and develop a Web IDE that supports stopping, single-stepping, breakpointing, and long-running computations. For nine languages, Stopify requires no or trivial compiler changes. For eight, our IDE is the first that provides these features. Two of our subject languages have compilers with similar features. Stopify's performance is competitive with these compilers and it makes them dramatically simpler. Stopify's abstractions rely on first-class continuations, which it provides by compiling JavaScript to JavaScript. We also identify sub-languages of JavaScript that compilers implicitly use, and exploit these to improve performance. Finally, Stopify needs to repeatedly interrupt and resume program execution. We use a sampling-based technique to estimate program speed that outperforms other systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Gogte:2018:PSF, author = "Vaibhav Gogte and Stephan Diestelhorst and William Wang and Satish Narayanasamy and Peter M. Chen and Thomas F. Wenisch", title = "Persistency for synchronization-free regions", journal = j-SIGPLAN, volume = "53", number = "4", pages = "46--61", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192367", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nascent persistent memory (PM) technologies promise the performance of DRAM with the durability of disk, but how best to integrate them into programming systems remains an open question. Recent work extends language memory models with a persistency model prescribing semantics for updates to PM. These semantics enable programmers to design data structures in PM that are accessed like memory and yet are recoverable upon crash or failure. 
Alas, we find the semantics and performance of existing approaches unsatisfying. Existing approaches require high-overhead mechanisms, are restricted to certain synchronization constructs, provide incomplete semantics, and/or may recover to state that cannot arise in fault-free execution. We propose persistency semantics that guarantee failure atomicity of synchronization-free regions (SFRs) --- program regions delimited by synchronization operations. Our approach provides clear semantics for the PM state recovery code may observe and extends C++11's ``sequential consistency for data-race-free'' guarantee to post-failure recovery code. We investigate two designs for failure-atomic SFRs that vary in performance and the degree to which commit of persistent state may lag execution. We demonstrate both approaches in LLVM v3.6.0 and compare to a state-of-the-art baseline to show performance improvement up to 87.5\% (65.5\% avg).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Akram:2018:WRG, author = "Shoaib Akram and Jennifer B. Sartor and Kathryn S. McKinley and Lieven Eeckhout", title = "Write-rationing garbage collection for hybrid memories", journal = j-SIGPLAN, volume = "53", number = "4", pages = "62--77", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192392", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Emerging Non-Volatile Memory (NVM) technologies offer high capacity and energy efficiency compared to DRAM, but suffer from limited write endurance and longer latencies. Prior work seeks the best of both technologies by combining DRAM and NVM in hybrid memories to attain low latency, high capacity, energy efficiency, and durability. Coarse-grained hardware and OS optimizations then spread writes out (wear-leveling) and place highly mutated pages in DRAM to extend NVM lifetimes. Unfortunately, even with these coarse-grained methods, popular Java applications exact impractical NVM lifetimes of 4 years or less. This paper shows how to make hybrid memories practical, without changing the programming model, by enhancing garbage collection in managed language runtimes. We find object write behaviors offer two opportunities: (1) 70\% of writes occur to newly allocated objects, and (2) 2\% of objects capture 81\% of writes to mature objects. We introduce write-rationing garbage collectors that exploit these fine-grained behaviors. They extend NVM lifetimes by placing highly mutated objects in DRAM and read-mostly objects in NVM. We implement two such systems. (1) Kingsguard-nursery places new allocation in DRAM and survivors in NVM, reducing NVM writes by 5$ \times $ versus NVM only with wear-leveling. (2) Kingsguard-writers (KG-W) places nursery objects in DRAM and survivors in a DRAM observer space. It monitors all mature object writes and moves unwritten mature objects from DRAM to NVM. Because most mature objects are unwritten, KG-W exploits NVM capacity while increasing NVM lifetimes by 11$ \times $. It reduces the energy-delay product by 32\% over DRAM-only and 29\% over NVM-only.
This work opens up new avenues for making hybrid memories practical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Lin:2018:MSN, author = "Chit-Kwan Lin and Andreas Wild and Gautham N. Chinya and Tsung-Han Lin and Mike Davies and Hong Wang", title = "Mapping spiking neural networks onto a manycore neuromorphic architecture", journal = j-SIGPLAN, volume = "53", number = "4", pages = "78--89", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192371", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a compiler for Loihi, a novel manycore neuromorphic processor that features a programmable, on-chip learning engine for training and executing spiking neural networks (SNNs). An SNN is distinguished from other neural networks in that (1) its independent computing units, or ``neurons'', communicate with others only through spike messages; and (2) each neuron evaluates local learning rules, which are functions of spike arrival and departure timings, to modify its local state. The collective neuronal state dynamics of an SNN form a nonlinear dynamical system that can be cast as an unconventional model of computation. To realize such an SNN on Loihi requires each constituent neuron to locally store and independently update its own spike timing information. However, each Loihi core has limited resources for this purpose and these must be shared by neurons assigned to the same core. In this work, we present a compiler for Loihi that maps the neurons of an SNN onto and across Loihi's cores efficiently. We show that a poor neuron-to-core mapping can incur significant energy costs and address this with a greedy algorithm that compiles SNNs onto Loihi in a power-efficient manner. In so doing, we highlight the need for further development of compilers for this new, emerging class of architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Brutschy:2018:SSA, author = "Lucas Brutschy and Dimitar Dimitrov and Peter M{\"u}ller and Martin Vechev", title = "Static serializability analysis for causal consistency", journal = j-SIGPLAN, volume = "53", number = "4", pages = "90--104", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192415", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many distributed databases provide only weak consistency guarantees to reduce synchronization overhead and remain available under network partitions. However, this leads to behaviors not possible under stronger guarantees. Such behaviors can easily defy programmer intuition and lead to errors that are notoriously hard to detect. In this paper, we propose a static analysis for detecting non-serializable behaviors of applications running on top of causally-consistent databases. 
Our technique is based on a novel, local serializability criterion and combines a generalization of graph-based techniques from the database literature with another, complementary analysis technique that encodes our serializability criterion into first-order logic formulas to be checked by an SMT solver. This analysis is more expensive yet more precise and produces concrete counter-examples. We implemented our methods and evaluated them on a number of applications from two different domains: cloud-backed mobile applications and clients of a distributed database. Our experiments demonstrate that our analysis is able to detect harmful serializability violations while producing only a small number of false alarms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Liu:2018:CIC, author = "Peizun Liu and Thomas Wahl", title = "{CUBA}: interprocedural {Context-UnBounded Analysis} of concurrent programs", journal = j-SIGPLAN, volume = "53", number = "4", pages = "105--119", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192419", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A classical result by Ramalingam about synchronization-sensitive interprocedural program analysis implies that reachability for concurrent threads running recursive procedures is undecidable. A technique proposed by Qadeer and Rehof, to bound the number of context switches allowed between the threads, leads to an incomplete solution that is, however, believed to catch ``most bugs'' in practice. The question whether the technique can also prove the absence of bugs at least in some cases has remained largely open. In this paper we introduce a broad verification methodology for resource-parameterized programs that observes how changes to the resource parameter affect the behavior of the program. Applied to the context-unbounded analysis problem (CUBA), the methodology results in partial verification techniques for procedural concurrent programs. Our solutions may not terminate, but are able to both refute and prove context-unbounded safety for concurrent recursive threads. We demonstrate the effectiveness of our method using a variety of examples, the safety of which cannot be proved by earlier, context-bounded methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Ferles:2018:SRA, author = "Kostas Ferles and Jacob {Van Geffen} and Isil Dillig and Yannis Smaragdakis", title = "Symbolic reasoning for automatic signal placement", journal = j-SIGPLAN, volume = "53", number = "4", pages = "120--134", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192395", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Explicit signaling between threads is a perennial cause of bugs in concurrent programs. While there are several run-time techniques to automatically notify threads upon the availability of some shared resource, such techniques are not widely-adopted due to their run-time overhead.
This paper proposes a new solution based on static analysis for automatically generating a performant explicit-signal program from its corresponding implicit-signal implementation. The key idea is to generate verification conditions that allow us to minimize the number of required signals and unnecessary context switches, while guaranteeing semantic equivalence between the source and target programs. We have implemented our method in a tool called Expresso and evaluate it on challenging benchmarks from prior papers and open-source software. Expresso-generated code significantly outperforms past automatic signaling mechanisms (avg. 1.56x speedup) and closely matches the performance of hand-optimized explicit-signal code.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Chen:2018:AAB, author = "Yu-Fang Chen and Matthias Heizmann and Ondrej Leng{\'a}l and Yong Li and Ming-Hsien Tsai and Andrea Turrini and Lijun Zhang", title = "Advanced automata-based algorithms for program termination checking", journal = j-SIGPLAN, volume = "53", number = "4", pages = "135--150", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192405", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In 2014, Heizmann et al. proposed a novel framework for program termination analysis. The analysis starts with a termination proof of a sample path. The path is generalized to a B{\"u}chi automaton (BA) whose language (by construction) represents a set of terminating paths. All these paths can be safely removed from the program. The removal of paths is done using automata difference, implemented via BA complementation and intersection. The analysis constructs in this way a set of BAs that jointly ``cover'' the behavior of the program, thus proving its termination. An implementation of the approach in Ultimate Automizer won the 1st place in the Termination category of SV-COMP 2017. In this paper, we exploit advanced automata-based algorithms and propose several non-trivial improvements of the framework. To alleviate the complementation computation for BAs---one of the most expensive operations in the framework---, we propose a multi-stage generalization construction. We start with generalizations producing subclasses of BAs (such as deterministic BAs) for which efficient complementation algorithms are known, and proceed to more general classes only if necessary. Particularly, we focus on the quite expressive subclass of semideterministic BAs and provide an improved complementation algorithm for this class. 
Our experimental evaluation shows that the proposed approach significantly improves the power of termination checking within the Ultimate Automizer framework.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Ottoni:2018:HJP, author = "Guilherme Ottoni", title = "{HHVM JIT}: a profile-guided, region-based compiler for {PHP} and Hack", journal = j-SIGPLAN, volume = "53", number = "4", pages = "151--165", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192374", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic languages such as PHP, JavaScript, Python, and Ruby have been gaining popularity over the last two decades. A very popular domain for these languages is web development, including server-side development of large-scale websites. As a result, improving the performance of these languages has become more important. Efficiently compiling programs in these languages is challenging, and many popular dynamic languages still lack efficient production-quality implementations. This paper describes the design of the second generation of the HHVM JIT and how it addresses the challenges to efficiently execute PHP and Hack programs. This new design uses profiling to build an aggressive region-based JIT compiler. We discuss the benefits of this approach compared to the more popular method-based and trace-based approaches to compile dynamic languages. Our evaluation running a very large PHP-based code base, the Facebook website, demonstrates the effectiveness of the new JIT design.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{DElia:2018:SRD, author = "Daniele Cono D'Elia and Camil Demetrescu", title = "On-stack replacement, distilled", journal = j-SIGPLAN, volume = "53", number = "4", pages = "166--180", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192396", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "On-stack replacement (OSR) is essential technology for adaptive optimization, allowing changes to code actively executing in a managed runtime. The engineering aspects of OSR are well-known among VM architects, with several implementations available to date. However, OSR is yet to be explored as a general means to transfer execution between related program versions, which can pave the road to unprecedented applications that stretch beyond VMs. We aim at filling this gap with a constructive and provably correct OSR framework, allowing a class of general-purpose transformation functions to yield a special-purpose replacement. We describe and evaluate an implementation of our technique in LLVM. 
As a novel application of OSR, we present a feasibility study on debugging of optimized code, showing how our techniques can be used to fix variables holding incorrect values at breakpoints due to optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Duck:2018:ETM, author = "Gregory J. Duck and Roland H. C. Yap", title = "{EffectiveSan}: type and memory error detection using dynamically typed {C\slash C++}", journal = j-SIGPLAN, volume = "53", number = "4", pages = "181--195", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192388", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Low-level programming languages with weak/static type systems, such as C and C++, are vulnerable to errors relating to the misuse of memory at runtime, such as (sub-)object bounds overflows, (re)use-after-free, and type confusion. Such errors account for many security and other undefined behavior bugs for programs written in these languages. In this paper, we introduce the notion of dynamically typed C/C++, which aims to detect such errors by dynamically checking the ``effective type'' of each object before use at runtime. We also present an implementation of dynamically typed C/C++ in the form of the Effective Type Sanitizer (EffectiveSan). EffectiveSan enforces type and memory safety using a combination of low-fat pointers, type meta data and type/bounds check instrumentation. We evaluate EffectiveSan against the SPEC2006 benchmark suite and the Firefox web browser, and detect several new type and memory errors. We also show that EffectiveSan achieves high compatibility and reasonable overheads for the given error coverage. Finally, we highlight that EffectiveSan is one of only a few tools that can detect sub-object bounds errors, and uses a novel approach (dynamic type checking) to do so.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Cai:2018:CRC, author = "Cheng Cai and Qirun Zhang and Zhiqiang Zuo and Khanh Nguyen and Guoqing Xu and Zhendong Su", title = "Calling-to-reference context translation via constraint-guided {CFL}-reachability", journal = j-SIGPLAN, volume = "53", number = "4", pages = "196--210", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192378", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A calling context is an important piece of information used widely to help developers understand program executions (e.g., for debugging). While calling contexts offer useful control information, information regarding data involved in a bug (e.g., what data structure holds a leaking object), in many cases, can bring developers closer to the bug's root cause. Such data information, often exhibited as heap reference paths, has already been needed by many tools. The only way for a dynamic analysis to record complete reference paths is to perform heap dumping, which incurs huge runtime overhead and renders the analysis impractical. 
This paper presents a novel static analysis that can precisely infer, from a calling context of a method that contains a use (e.g., read or write) of an object, the heap reference paths leading to the object at the time the use occurs. Since calling context recording is much less expensive, our technique provides benefits for all dynamic techniques that need heap information, significantly reducing their overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Chong:2018:STW, author = "Nathan Chong and Tyler Sorensen and John Wickerson", title = "The semantics of transactions and weak memory in x86, {Power}, {ARM}, and {C++}", journal = j-SIGPLAN, volume = "53", number = "4", pages = "211--225", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192373", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Weak memory models provide a complex, system-centric semantics for concurrent programs, while transactional memory (TM) provides a simpler, programmer-centric semantics. Both have been studied in detail, but their combined semantics is not well understood. This is problematic because such widely-used architectures and languages as x86, Power, and C++ all support TM, and all have weak memory models. Our work aims to clarify the interplay between weak memory and TM by extending existing axiomatic weak memory models (x86, Power, ARMv8, and C++) with new rules for TM. Our formal models are backed by automated tooling that enables (1) the synthesis of tests for validating our models against existing implementations and (2) the model-checking of TM-related transformations, such as lock elision and compiling C++ transactions to hardware. A key finding is that a proposed TM extension to ARMv8 currently being considered within ARM Research is incompatible with lock elision without sacrificing portability or performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Milano:2018:MLM, author = "Matthew Milano and Andrew C. Myers", title = "{MixT}: a language for mixing consistency in geodistributed transactions", journal = j-SIGPLAN, volume = "53", number = "4", pages = "226--241", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192375", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming concurrent, distributed systems is hard-especially when these systems mutate shared, persistent state replicated at geographic scale. To enable high availability and scalability, a new class of weakly consistent data stores has become popular. However, some data needs strong consistency. To manipulate both weakly and strongly consistent data in a single transaction, we introduce a new abstraction: mixed-consistency transactions, embodied in a new embedded language, MixT. Programmers explicitly associate consistency models with remote storage sites; each atomic, isolated transaction can access a mixture of data with different consistency models. 
Compile-time information-flow checking, applied to consistency models, ensures that these models are mixed safely and enables the compiler to automatically partition transactions. New run-time mechanisms ensure that consistency models can also be mixed safely, even when the data used by a transaction resides on separate, mutually unaware stores. Performance measurements show that despite their stronger guarantees, mixed-consistency transactions retain much of the speed of weak consistency, significantly outperforming traditional serializable transactions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Dolan:2018:BDR, author = "Stephen Dolan and KC Sivaramakrishnan and Anil Madhavapeddy", title = "Bounding data races in space and time", journal = j-SIGPLAN, volume = "53", number = "4", pages = "242--255", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192421", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a new semantics for shared-memory parallel programs that gives strong guarantees even in the presence of data races. Our local data race freedom property guarantees that all data-race-free portions of programs exhibit sequential semantics. We provide a straightforward operational semantics and an equivalent axiomatic model, and evaluate an implementation for the OCaml programming language. Our evaluation demonstrates that it is possible to balance a comprehensible memory model with a reasonable (no overhead on x86, ~0.6\% on ARM) sequential performance trade-off in a mainstream programming language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Sanchez-Stern:2018:FRC, author = "Alex Sanchez-Stern and Pavel Panchekha and Sorin Lerner and Zachary Tatlock", title = "Finding root causes of floating point error", journal = j-SIGPLAN, volume = "53", number = "4", pages = "256--269", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192411", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Floating-point arithmetic plays a central role in science, engineering, and finance by enabling developers to approximate real arithmetic. To address numerical issues in large floating-point applications, developers must identify root causes, which is difficult because floating-point errors are generally non-local, non-compositional, and non-uniform. This paper presents Herbgrind, a tool to help developers identify and address root causes in numerical code written in low-level languages like C/C++ and Fortran. Herbgrind dynamically tracks dependencies between operations and program outputs to avoid false positives and abstracts erroneous computations to simplified program fragments whose improvement can reduce output error. 
We perform several case studies applying Herbgrind to large, expert-crafted numerical programs and show that it scales to applications spanning hundreds of thousands of lines, correctly handling the low-level details of modern floating point hardware and mathematical libraries and tracking error across function boundaries and through the heap.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Adams:2018:RFF, author = "Ulf Adams", title = "{Ry{\=u}}: fast float-to-string conversion", journal = j-SIGPLAN, volume = "53", number = "4", pages = "270--282", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192369", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Ry{\=u}, a new routine to convert binary floating point numbers to their decimal representations using only fixed-size integer operations, and prove its correctness. Ry{\=u} is simpler and approximately three times faster than the previously fastest implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", keywords = "base conversion; input-output conversion; radix conversion; round-trip base conversion", remark = "PLDI '18 proceedings.", } @Article{Steindorfer:2018:MOA, author = "Michael J. Steindorfer and Jurgen J. Vinju", title = "To-many or to-one? {All}-in-one! {Efficient} purely functional multi-maps with type-heterogeneous hash-tries", journal = j-SIGPLAN, volume = "53", number = "4", pages = "283--295", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192420", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "An immutable multi-map is a many-to-many map data structure with expected fast insert and lookup operations. This data structure is used for applications processing graphs or many-to-many relations as applied in compilers, runtimes of programming languages, or in static analysis of object-oriented systems. Collection data structures are assumed to carefully balance execution time of operations with memory consumption characteristics and need to scale gracefully from a few elements to multiple gigabytes at least. When processing larger in-memory data sets the overhead of the data structure encoding itself becomes a memory usage bottleneck, dominating the overall performance. In this paper we propose AXIOM, a novel hash-trie data structure that allows for a highly efficient and type-safe multi-map encoding by distinguishing inlined values of singleton sets from nested sets of multi-mappings. AXIOM strictly generalizes over previous hash-trie data structures by supporting the processing of fine-grained type-heterogeneous content on the implementation level (while API and language support for type-heterogeneity are not in the scope of this paper). We detail the design and optimizations of AXIOM and further compare it against state-of-the-art immutable maps and multi-maps in Java, Scala and Clojure.
We isolate key differences using microbenchmarks and validate the resulting conclusions on a case study in static analysis. AXIOM reduces the key-value storage overhead by 1.87x; with specializing and inlining across collection boundaries it improves by 5.1x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Koeplinger:2018:SLC, author = "David Koeplinger and Matthew Feldman and Raghu Prabhakar and Yaqi Zhang and Stefan Hadjis and Ruben Fiszel and Tian Zhao and Luigi Nardi and Ardavan Pedram and Christos Kozyrakis and Kunle Olukotun", title = "{Spatial}: a language and compiler for application accelerators", journal = j-SIGPLAN, volume = "53", number = "4", pages = "296--311", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192379", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Industry is increasingly turning to reconfigurable architectures like FPGAs and CGRAs for improved performance and energy efficiency. Unfortunately, adoption of these architectures has been limited by their programming models. HDLs lack abstractions for productivity and are difficult to target from higher level languages. HLS tools are more productive, but offer an ad-hoc mix of software and hardware abstractions which make performance optimizations difficult. In this work, we describe a new domain-specific language and compiler called Spatial for higher level descriptions of application accelerators. We describe Spatial's hardware-centric abstractions for both programmer productivity and design performance, and summarize the compiler passes required to support these abstractions, including pipeline scheduling, automatic memory banking, and automated design tuning driven by active machine learning. We demonstrate the language's ability to target FPGAs and CGRAs from common source code. We show that applications written in Spatial are, on average, 42\% shorter and achieve a mean speedup of 2.9x over SDAccel HLS when targeting a Xilinx UltraScale+ VU9P FPGA on an Amazon EC2 F1 instance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Kislal:2018:ECC, author = "Orhan Kislal and Jagadish Kotra and Xulong Tang and Mahmut Taylan Kandemir and Myoungsoo Jung", title = "Enhancing computation-to-core assignment with physical location information", journal = j-SIGPLAN, volume = "53", number = "4", pages = "312--327", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192386", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Going beyond a certain number of cores in modern architectures requires an on-chip network more scalable than conventional buses. However, employing an on-chip network in a manycore system (to improve scalability) makes the latencies of the data accesses issued by a core non-uniform. This non-uniformity can play a significant role in shaping the overall application performance. 
This work presents a novel compiler strategy which involves exposing architecture information to the compiler to enable an optimized computation-to-core mapping. Specifically, we propose a compiler-guided scheme that takes into account the relative positions of (and distances between) cores, last-level caches (LLCs) and memory controllers (MCs) in a manycore system, and generates a mapping of computations to cores with the goal of minimizing the on-chip network traffic. The experimental data collected using a set of 21 multi-threaded applications reveal that, on an average, our approach reduces the on-chip network latency in a 6$ \times $6 manycore system by 38.4\% in the case of private LLCs, and 43.8\% in the case of shared LLCs. These improvements translate to the corresponding execution time improvements of 10.9\% and 12.7\% for the private LLC and shared LLC based systems, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Tran:2018:SSH, author = "Kim-Anh Tran and Alexandra Jimborean and Trevor E. Carlson and Konstantinos Koukos and Magnus Sj{\"a}lander and Stefanos Kaxiras", title = "{SWOOP}: software-hardware co-design for non-speculative, execute-ahead, in-order cores", journal = j-SIGPLAN, volume = "53", number = "4", pages = "328--343", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192393", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Increasing demands for energy efficiency constrain emerging hardware. These new hardware trends challenge the established assumptions in code generation and force us to rethink existing software optimization techniques. We propose a cross-layer redesign of the way compilers and the underlying microarchitecture are built and interact, to achieve both performance and high energy efficiency. In this paper, we address one of the main performance bottlenecks --- last-level cache misses --- through a software-hardware co-design. Our approach is able to hide memory latency and attain increased memory and instruction level parallelism by orchestrating a non-speculative, execute-ahead paradigm in software (SWOOP). While out-of-order (OoO) architectures attempt to hide memory latency by dynamically reordering instructions, they do so through expensive, power-hungry, speculative mechanisms. We aim to shift this complexity into software, and we build upon compilation techniques inherited from VLIW, software pipelining, modulo scheduling, decoupled access-execution, and software prefetching. In contrast to previous approaches we do not rely on either software or hardware speculation that can be detrimental to efficiency.
Our SWOOP compiler is enhanced with lightweight architectural support, thus being able to transform applications that include highly complex control-flow and indirect memory accesses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Liu:2018:ISI, author = "Hongyu Liu and Sam Silvestro and Wei Wang and Chen Tian and Tongping Liu", title = "{iReplayer}: in-situ and identical record-and-replay for multithreaded applications", journal = j-SIGPLAN, volume = "53", number = "4", pages = "344--358", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192380", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reproducing executions of multithreaded programs is very challenging due to many intrinsic and external non-deterministic factors. Existing RnR systems achieve significant progress in terms of performance overhead, but none targets the in-situ setting, in which replay occurs within the same process as the recording process. Also, most existing work cannot achieve identical replay, which may prevent the reproduction of some errors. This paper presents iReplayer, which aims to identically replay multithreaded programs in the original process (under the ``in-situ'' setting). The novel in-situ and identical replay of iReplayer makes it more likely to reproduce errors, and allows it to directly employ debugging mechanisms (e.g. watchpoints) to aid failure diagnosis. Currently, iReplayer only incurs 3\% performance overhead on average, which allows it to be always enabled in the production environment. iReplayer enables a range of possibilities, and this paper presents three examples: two automatic tools for detecting buffer overflows and use-after-free bugs, and one interactive debugging tool that is integrated with GDB.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Liu:2018:DFC, author = "Bozhen Liu and Jeff Huang", title = "{D4}: fast concurrency debugging with parallel differential analysis", journal = j-SIGPLAN, volume = "53", number = "4", pages = "359--373", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192390", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present D4, a fast concurrency analysis framework that detects concurrency bugs (e.g., data races and deadlocks) interactively in the programming phase. As developers add, modify, and remove statements, the code changes are sent to D4 to detect concurrency bugs in real time, which in turn provides immediate feedback to the developer of the new bugs. The cornerstone of D4 includes a novel system design and two novel parallel differential algorithms that embrace both change and parallelization for fundamental static analyses of concurrent programs. Both algorithms react to program changes by memoizing the analysis results and only recomputing the impact of a change in parallel. 
Our evaluation on an extensive collection of large real-world applications shows that D4 efficiently pinpoints concurrency bugs within 100ms on average after a code change, several orders of magnitude faster than both the exhaustive analysis and the state-of-the-art incremental techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Roemer:2018:HCU, author = "Jake Roemer and Kaan Gen{\c{c}} and Michael D. Bond", title = "High-coverage, unbounded sound predictive race detection", journal = j-SIGPLAN, volume = "53", number = "4", pages = "374--389", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192385", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic program analysis can predict data races knowable from an observed execution, but existing predictive analyses either miss races or cannot analyze full program executions. This paper presents Vindicator, a novel, sound (no false races) predictive approach that finds more data races than existing predictive approaches. Vindicator achieves high coverage by using a new, efficient analysis that finds all possible predictable races but may detect false races. Vindicator ensures soundness using a novel algorithm that checks each potential race to determine whether it is a true predictable race. An evaluation using large Java programs shows that Vindicator finds hard-to-detect predictable races that existing sound predictive analyses miss, at a comparable performance cost.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Peng:2018:CDC, author = "Yuanfeng Peng and Vinod Grover and Joseph Devietti", title = "{CURD}: a dynamic {CUDA} race detector", journal = j-SIGPLAN, volume = "53", number = "4", pages = "390--403", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192368", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As GPUs have become an integral part of nearly every processor, GPU programming has become increasingly popular. GPU programming requires a combination of extreme levels of parallelism and low-level programming, making it easy for concurrency bugs such as data races to arise. These concurrency bugs can be extremely subtle and difficult to debug due to the massive numbers of threads running concurrently on a modern GPU. While some tools exist to detect data races in GPU programs, they are often prohibitively slow or focused only on a small class of data races in shared memory. Compared to prior work, our race detector, CURD, can detect data races precisely on both shared and global memory, selects an appropriate race detection algorithm based on the synchronization used in a program, and utilizes efficient compiler instrumentation to reduce performance overheads. Across 53 benchmarks, we find that using CURD incurs an average slowdown of just 2.88x over native execution.
CURD is 2.1x faster than Nvidia's CUDA-Racecheck race detector, despite detecting a much broader class of races. CURD finds 35 races across our benchmarks, including bugs in established benchmark suites and in sample programs from Nvidia.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Alon:2018:GPB, author = "Uri Alon and Meital Zilberstein and Omer Levy and Eran Yahav", title = "A general path-based representation for predicting program properties", journal = j-SIGPLAN, volume = "53", number = "4", pages = "404--419", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192412", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/csharp.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Predicting program properties such as names or expression types has a wide range of applications. It can ease the task of programming, and increase programmer productivity. A major challenge when learning from programs is how to represent programs in a way that facilitates effective learning. We present a general path-based representation for learning from programs. Our representation is purely syntactic and extracted automatically. The main idea is to represent a program using paths in its abstract syntax tree (AST). This allows a learning model to leverage the structured nature of code rather than treating it as a flat sequence of tokens. We show that this representation is general and can: (i) cover different prediction tasks, (ii) drive different learning algorithms (for both generative and discriminative models), and (iii) work across different programming languages. We evaluate our approach on the tasks of predicting variable names, method names, and full types. We use our representation to drive both CRF-based and word2vec-based learning, for programs of four languages: JavaScript, Java, Python and C\#. Our evaluation shows that our approach obtains better results than task-specific handcrafted representations across different tasks and programming languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Feng:2018:PSU, author = "Yu Feng and Ruben Martins and Osbert Bastani and Isil Dillig", title = "Program synthesis using conflict-driven learning", journal = j-SIGPLAN, volume = "53", number = "4", pages = "420--435", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192382", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a new conflict-driven program synthesis technique that is capable of learning from past mistakes. Given a spurious program that violates the desired specification, our synthesis algorithm identifies the root cause of the conflict and learns new lemmas that can prevent similar mistakes in the future. Specifically, we introduce the notion of equivalence modulo conflict and show how this idea can be used to learn useful lemmas that allow the synthesizer to prune large parts of the search space. 
We have implemented a general-purpose CDCL-style program synthesizer called Neo and evaluate it in two different application domains, namely data wrangling in R and functional programming over lists. Our experiments demonstrate the substantial benefits of conflict-driven learning and show that Neo outperforms two state-of-the-art synthesis tools, Morpheus and Deepcoder, that target these respective domains.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Lee:2018:ASB, author = "Woosuk Lee and Kihong Heo and Rajeev Alur and Mayur Naik", title = "Accelerating search-based program synthesis using learned probabilistic models", journal = j-SIGPLAN, volume = "53", number = "4", pages = "436--449", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192410", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A key challenge in program synthesis concerns how to efficiently search for the desired program in the space of possible programs. We propose a general approach to accelerate search-based program synthesis by biasing the search towards likely programs. Our approach targets a standard formulation, syntax-guided synthesis (SyGuS), by extending the grammar of possible programs with a probabilistic model dictating the likelihood of each program. We develop a weighted search algorithm to efficiently enumerate programs in order of their likelihood. We also propose a method based on transfer learning that enables to effectively learn a powerful model, called probabilistic higher-order grammar, from known solutions in a domain. We have implemented our approach in a tool called Euphony and evaluate it on SyGuS benchmark problems from a variety of domains. We show that Euphony can learn good models using easily obtainable solutions, and achieves significant performance gains over existing general-purpose as well as domain-specific synthesizers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Paletov:2018:ICA, author = "Rumen Paletov and Petar Tsankov and Veselin Raychev and Martin Vechev", title = "Inferring crypto {API} rules from code changes", journal = j-SIGPLAN, volume = "53", number = "4", pages = "450--464", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192403", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Creating and maintaining an up-to-date set of security rules that match misuses of crypto APIs is challenging, as crypto APIs constantly evolve over time with new cryptographic primitives and settings, making existing ones obsolete. To address this challenge, we present a new approach to extract security fixes from thousands of code changes. 
Our approach consists of: (i) identifying code changes, which often capture security fixes, (ii) an abstraction that filters irrelevant code changes (such as refactorings), and (iii) a clustering analysis that reveals commonalities between semantic code changes and helps in eliciting security rules. We applied our approach to the Java Crypto API and showed that it is effective: (i) our abstraction effectively filters non-semantic code changes (over 99\% of all changes) without removing security fixes, and (ii) over 80\% of the code changes are security fixes identifying security rules. Based on our results, we identified 13 rules, including new ones not supported by existing security checkers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Gulwani:2018:ACP, author = "Sumit Gulwani and Ivan Radicek and Florian Zuleger", title = "Automated clustering and program repair for introductory programming assignments", journal = j-SIGPLAN, volume = "53", number = "4", pages = "465--480", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192387", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Providing feedback on programming assignments is a tedious task for the instructor, and even impossible in large Massive Open Online Courses with thousands of students. Previous research has suggested that program repair techniques can be used to generate feedback in programming education. In this paper, we present a novel fully automated program repair algorithm for introductory programming assignments. The key idea of the technique, which enables automation and scalability, is to use the existing correct student solutions to repair the incorrect attempts. We evaluate the approach in two experiments: (I) We evaluate the number, size and quality of the generated repairs on 4,293 incorrect student attempts from an existing MOOC. We find that our approach can repair 97\% of student attempts, while 81\% of those are small repairs of good quality. (II) We conduct a preliminary user study on performance and repair usefulness in an interactive teaching setting. We obtain promising initial results (the average usefulness grade 3.4 on a scale from 1 to 5), and conclude that our approach can be used in an interactive setting.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Wang:2018:SAR, author = "Ke Wang and Rishabh Singh and Zhendong Su", title = "Search, align, and repair: data-driven feedback generation for introductory programming exercises", journal = j-SIGPLAN, volume = "53", number = "4", pages = "481--495", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192384", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces the ``Search, Align, and Repair'' data-driven program repair framework to automate feedback generation for introductory programming exercises. 
Distinct from existing techniques, our goal is to develop an efficient, fully automated, and problem-agnostic technique for large or MOOC-scale introductory programming courses. We leverage the large amount of available student submissions in such settings and develop new algorithms for identifying similar programs, aligning correct and incorrect programs, and repairing incorrect programs by finding minimal fixes. We have implemented our technique in the Sarfgen system and evaluated it on thousands of real student attempts from the Microsoft-DEV204.1x edX course and the Microsoft CodeHunt platform. Our results show that Sarfgen can, within two seconds on average, generate concise, useful feedback for 89.7\% of the incorrect student submissions. It has been integrated with the Microsoft-DEV204.1X edX class and deployed for production use.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Ngo:2018:BER, author = "Van Chan Ngo and Quentin Carbonneaux and Jan Hoffmann", title = "Bounded expectations: resource analysis for probabilistic programs", journal = j-SIGPLAN, volume = "53", number = "4", pages = "496--512", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192394", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents a new static analysis for deriving upper bounds on the expected resource consumption of probabilistic programs. The analysis is fully automatic and derives symbolic bounds that are multivariate polynomials in the inputs. The new technique combines manual state-of-the-art reasoning techniques for probabilistic programs with an effective method for automatic resource-bound analysis of deterministic programs. It can be seen as both, an extension of automatic amortized resource analysis (AARA) to probabilistic programs and an automation of manual reasoning for probabilistic programs that is based on weakest preconditions. An advantage of the technique is that it combines the clarity and compositionality of a weakest-precondition calculus with the efficient automation of AARA. As a result, bound inference can be reduced to off-the-shelf LP solving in many cases and automatically-derived bounds can be interactively extended with standard program logics if the automation fails. Building on existing work, the soundness of the analysis is proved with respect to an operational semantics that is based on Markov decision processes. The effectiveness of the technique is demonstrated with a prototype implementation that is used to automatically analyze 39 challenging probabilistic programs and randomized algorithms. 
Experiments indicate that the derived constant factors in the bounds are very precise and even optimal for some programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Wang:2018:PAF, author = "Di Wang and Jan Hoffmann and Thomas Reps", title = "{PMAF}: an algebraic framework for static analysis of probabilistic programs", journal = j-SIGPLAN, volume = "53", number = "4", pages = "513--528", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192408", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Automatically establishing that a probabilistic program satisfies some property $ \varphi $ is a challenging problem. While a sampling-based approach --- which involves running the program repeatedly --- can suggest that $ \varphi $ holds, to establish that the program satisfies $ \varphi $ , analysis techniques must be used. Despite recent successes, probabilistic static analyses are still more difficult to design and implement than their deterministic counterparts. This paper presents a framework, called PMAF, for designing, implementing, and proving the correctness of static analyses of probabilistic programs with challenging features such as recursion, unstructured control-flow, divergence, nondeterminism, and continuous distributions. PMAF introduces pre-Markov algebras to factor out common parts of different analyses. To perform interprocedural analysis and to create procedure summaries, PMAF extends ideas from non-probabilistic interprocedural dataflow analysis to the probabilistic setting. One novelty is that PMAF is based on a semantics formulated in terms of a control-flow hyper-graph for each procedure, rather than a standard control-flow graph. To evaluate its effectiveness, PMAF has been used to reformulate and implement existing intraprocedural analyses for Bayesian-inference and the Markov decision problem, by creating corresponding interprocedural analyses. Additionally, PMAF has been used to implement a new interprocedural linear expectation-invariant analysis. Experiments with benchmark programs for the three analyses demonstrate that the approach is practical.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Acharya:2018:PAT, author = "Aravind Acharya and Uday Bondhugula and Albert Cohen", title = "Polyhedral auto-transformation with no integer linear programming", journal = j-SIGPLAN, volume = "53", number = "4", pages = "529--542", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192401", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "State-of-the-art algorithms used in automatic polyhedral transformation for parallelization and locality optimization typically rely on Integer Linear Programming (ILP). This poses a scalability issue when scaling to tens or hundreds of statements, and may be disconcerting in production compiler settings.
In this work, we consider relaxing integrality in the ILP formulation of the Pluto algorithm, a popular algorithm used to find good affine transformations. We show that the rational solutions obtained from the relaxed LP formulation can easily be scaled to valid integral ones to obtain desired solutions, although with some caveats. We first present formal results connecting the solution of the relaxed LP to the original Pluto ILP. We then show that there are difficulties in realizing the above theoretical results in practice, and propose an alternate approach to overcome those while still leveraging linear programming. Our new approach obtains dramatic compile-time speedups for a range of large benchmarks. While achieving these compile-time improvements, we show that the performance of the transformed code is not sacrificed. Our approach to automatic transformation provides a mean compilation time improvement of 5.6$ \times $ over state-of-the-art on relevant challenging benchmarks from the NAS PB, SPEC CPU 2006, and PolyBench suites. We also came across situations where prior frameworks failed to find a transformation in a reasonable amount of time, while our new approach did so instantaneously.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Moll:2018:PCF, author = "Simon Moll and Sebastian Hack", title = "Partial control-flow linearization", journal = j-SIGPLAN, volume = "53", number = "4", pages = "543--556", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192413", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "If-conversion is a fundamental technique for vectorization. It accounts for the fact that in a SIMD program, several targets of a branch might be executed because of divergence. Especially for irregular data-parallel workloads, it is crucial to avoid if-converting non-divergent branches to increase SIMD utilization. In this paper, we present partial linearization, a simple and efficient if-conversion algorithm that overcomes several limitations of existing if-conversion techniques. In contrast to prior work, it has provable guarantees on which non-divergent branches are retained and will never duplicate code or insert additional branches. We show how our algorithm can be used in a classic loop vectorizer as well as to implement data-parallel languages such as ISPC or OpenCL. Furthermore, we implement prior vectorizer optimizations on top of partial linearization in a more general way. 
We evaluate the implementation of our algorithm in LLVM on a range of irregular data analytics kernels, a neutronics simulation benchmark and NAB, a molecular dynamics benchmark from SPEC2017 on AVX2, AVX512, and ARM Advanced SIMD machines and report speedups of up to 146 \% over ICC, GCC and Clang O3.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Chen:2018:LAT, author = "Dong Chen and Fangzhou Liu and Chen Ding and Sreepathi Pai", title = "Locality analysis through static parallel sampling", journal = j-SIGPLAN, volume = "53", number = "4", pages = "557--570", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192402", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Locality analysis is important since accessing memory is much slower than computing. Compile-time locality analysis can provide detailed program-level feedback for compilers or runtime systems faster than trace-based locality analysis. In this paper, we describe a new approach to locality analysis based on static parallel sampling. A compiler analyzes loop-based code and generates sampler code which is run to measure locality. Our approach can predict precise cache line granularity miss ratio curves for complex loops with non-linear array references and even branches. The precision and overhead of static sampling are evaluated using PolyBench and a bit-reversal loop. Our result shows that by randomly sampling 2\% of loop iterations, a compiler can construct almost exact miss ratio curves as trace based analysis. Sampling 0.5\% and 1\% iterations can achieve good precision and efficiency with an average 0.6\% to 1\% the time of tracing respectively. Our analysis can also be parallelized. The analysis may assist program optimization techniques such as tiling, program co-location, cache hint selection and help to analyze write locality and parallel locality.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Cusumano-Towner:2018:IIP, author = "Marco Cusumano-Towner and Benjamin Bichsel and Timon Gehr and Martin Vechev and Vikash K. Mansinghka", title = "Incremental inference for probabilistic programs", journal = j-SIGPLAN, volume = "53", number = "4", pages = "571--585", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192399", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a novel approach for approximate sampling in probabilistic programs based on incremental inference. The key idea is to adapt the samples for a program P into samples for a program Q, thereby avoiding the expensive sampling computation for program Q. 
To enable incremental inference in probabilistic programming, our work: (i) introduces the concept of a trace translator which adapts samples from P into samples of Q, (ii) phrases this translation approach in the context of sequential Monte Carlo (SMC), which gives theoretical guarantees that the adapted samples converge to the distribution induced by Q, and (iii) shows how to obtain a concrete trace translator by establishing a correspondence between the random choices of the two probabilistic programs. We implemented our approach in two different probabilistic programming systems and showed that, compared to methods that sample the program Q from scratch, incremental inference can lead to orders of magnitude increase in efficiency, depending on how closely related P and Q are.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Gehr:2018:BPI, author = "Timon Gehr and Sasa Misailovic and Petar Tsankov and Laurent Vanbever and Pascal Wiesmann and Martin Vechev", title = "{Bayonet}: probabilistic inference for networks", journal = j-SIGPLAN, volume = "53", number = "4", pages = "586--602", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192400", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Network operators often need to ensure that important probabilistic properties are met, such as that the probability of network congestion is below a certain threshold. Ensuring such properties is challenging and requires both a suitable language for probabilistic networks and an automated procedure for answering probabilistic inference queries. We present Bayonet, a novel approach that consists of: (i) a probabilistic network programming language and (ii) a system that performs probabilistic inference on Bayonet programs. The key insight behind Bayonet is to phrase the problem of probabilistic network reasoning as inference in existing probabilistic languages. As a result, Bayonet directly leverages existing probabilistic inference systems and offers a flexible and expressive interface to operators. We present a detailed evaluation of Bayonet on common network scenarios, such as network congestion, reliability of packet delivery, and others. Our results indicate that Bayonet can express such practical scenarios and answer queries for realistic topology sizes (with up to 30 nodes).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Mansinghka:2018:PPP, author = "Vikash K. Mansinghka and Ulrich Schaechtle and Shivam Handa and Alexey Radul and Yutian Chen and Martin Rinard", title = "Probabilistic programming with programmable inference", journal = j-SIGPLAN, volume = "53", number = "4", pages = "603--616", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192409", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We introduce inference metaprogramming for probabilistic programming languages, including new language constructs, a formalism, and the first demonstration of effectiveness in practice.
Instead of relying on rigid black-box inference algorithms hard-coded into the language implementation as in previous probabilistic programming languages, inference metaprogramming enables developers to (1) dynamically decompose inference problems into subproblems, (2) apply inference tactics to subproblems, (3) alternate between incorporating new data and performing inference over existing data, and (4) explore multiple execution traces of the probabilistic program at once. Implemented tactics include gradient-based optimization, Markov chain Monte Carlo, variational inference, and sequential Monte Carlo techniques. Inference metaprogramming enables the concise expression of probabilistic models and inference algorithms across diverse fields, such as computer vision, data science, and robotics, within a single probabilistic programming language.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Bohrer:2018:VVC, author = "Brandon Bohrer and Yong Kiam Tan and Stefan Mitsch and Magnus O. Myreen and Andr{\'e} Platzer", title = "{VeriPhy}: verified controller executables from verified cyber--physical system models", journal = j-SIGPLAN, volume = "53", number = "4", pages = "617--630", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192406", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present VeriPhy, a verified pipeline which automatically transforms verified high-level models of safety-critical cyber-physical systems (CPSs) in differential dynamic logic (dL) to verified controller executables. VeriPhy proves that all safety results are preserved end-to-end as it bridges abstraction gaps, including: (i) the gap between mathematical reals in physical models and machine arithmetic in the implementation, (ii) the gap between real physics and its differential-equation models, and (iii) the gap between nondeterministic controller models and machine code. VeriPhy reduces CPS safety to the faithfulness of the physical environment, which is checked at runtime by synthesized, verified monitors. We use three provers in this effort: KeYmaera X, HOL4, and Isabelle/HOL. To minimize the trusted base, we cross-verify KeYmaeraX in Isabelle/HOL. We evaluate the resulting controller and monitors on commodity robotics hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Kang:2018:CVC, author = "Jeehoon Kang and Yoonseung Kim and Youngju Song and Juneyoung Lee and Sanghoon Park and Mark Dongyeon Shin and Yonghyun Kim and Sungkeun Cho and Joonwon Choi and Chung-Kil Hur and Kwangkeun Yi", title = "{Crellvm}: verified credible compilation for {LLVM}", journal = j-SIGPLAN, volume = "53", number = "4", pages = "631--645", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Production compilers such as GCC and LLVM are large complex software systems, for which achieving a high level of reliability is hard.
Although testing is an effective method for finding bugs, it alone cannot guarantee a high level of reliability. To provide a higher level of reliability, many approaches that examine compilers' internal logics have been proposed. However, none of them have been successfully applied to major optimizations of production compilers. This paper presents Crellvm: a verified credible compilation framework for LLVM, which can be used as a systematic way of providing a high level of reliability for major optimizations in LLVM. Specifically, we augment an LLVM optimizer to generate translation results together with their correctness proofs, which can then be checked by a proof checker formally verified in Coq. As case studies, we applied our approach to two major optimizations of LLVM: register promotion mem2reg and global value numbering gvn, having found four new miscompilation bugs (two in each).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Gu:2018:CCA, author = "Ronghui Gu and Zhong Shao and Jieung Kim and Xiongnan (Newman) Wu and J{\'e}r{\'e}mie Koenig and Vilhelm Sj{\"o}berg and Hao Chen and David Costanzo and Tahina Ramananandro", title = "Certified concurrent abstraction layers", journal = j-SIGPLAN, volume = "53", number = "4", pages = "646--661", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192381", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent abstraction layers are ubiquitous in modern computer systems because of the pervasiveness of multithreaded programming and multicore hardware. Abstraction layers are used to hide the implementation details (e.g., fine-grained synchronization) and reduce the complex dependencies among components at different levels of abstraction. Despite their obvious importance, concurrent abstraction layers have not been treated formally. This severely limits the applicability of layer-based techniques and makes it difficult to scale verification across multiple concurrent layers. In this paper, we present CCAL---a fully mechanized programming toolkit developed under the CertiKOS project---for specifying, composing, compiling, and linking certified concurrent abstraction layers. CCAL consists of three technical novelties: a new game-theoretical, strategy-based compositional semantic model for concurrency (and its associated program verifiers), a set of formal linking theorems for composing multithreaded and multicore concurrent layers, and a new CompCertX compiler that supports certified thread-safe compilation and linking. The CCAL toolkit is implemented in Coq and supports layered concurrent programming in both C and assembly. It has been successfully applied to build a fully certified concurrent OS kernel with fine-grained locking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Taube:2018:MDD, author = "Marcelo Taube and Giuliano Losa and Kenneth L. McMillan and Oded Padon and Mooly Sagiv and Sharon Shoham and James R. 
Wilcox and Doug Woos", title = "Modularity for decidability of deductive verification with applications to distributed systems", journal = j-SIGPLAN, volume = "53", number = "4", pages = "662--677", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192414", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Proof automation can substantially increase productivity in formal verification of complex systems. However, unpredictability of automated provers in handling quantified formulas presents a major hurdle to usability of these tools. We propose to solve this problem not by improving the provers, but by using a modular proof methodology that allows us to produce decidable verification conditions. Decidability greatly improves predictability of proof automation, resulting in a more practical verification approach. We apply this methodology to develop verified implementations of distributed protocols, demonstrating its effectiveness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Bastani:2018:ALP, author = "Osbert Bastani and Rahul Sharma and Alex Aiken and Percy Liang", title = "Active learning of points-to specifications", journal = j-SIGPLAN, volume = "53", number = "4", pages = "678--692", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192383", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When analyzing programs, large libraries pose significant challenges to static points-to analysis. A popular solution is to have a human analyst provide points-to specifications that summarize relevant behaviors of library code, which can substantially improve precision and handle missing code such as native code. We propose Atlas, a tool that automatically infers points-to specifications. Atlas synthesizes unit tests that exercise the library code, and then infers points-to specifications based on observations from these executions.
Atlas automatically infers specifications for the Java standard library, and produces better results for a client static information flow analysis on a benchmark of 46 Android apps compared to using existing handwritten specifications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Shi:2018:PFP, author = "Qingkai Shi and Xiao Xiao and Rongxin Wu and Jinguo Zhou and Gang Fan and Charles Zhang", title = "{Pinpoint}: fast and precise sparse value flow analysis for million lines of code", journal = j-SIGPLAN, volume = "53", number = "4", pages = "693--706", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192418", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "When dealing with millions of lines of code, we still cannot have the cake and eat it: sparse value-flow analysis is powerful in checking source-sink problems, but existing work cannot escape from the ``pointer trap'' --- a precise points-to analysis limits its scalability and an imprecise one seriously undermines its precision. We present Pinpoint, a holistic approach that decomposes the cost of high-precision points-to analysis by precisely discovering local data dependence and delaying the expensive inter-procedural analysis through memorization. Such memorization enables the on-demand slicing of only the necessary inter-procedural data dependence and path feasibility queries, which are then solved by a costly SMT solver. Experiments show that Pinpoint can check programs such as MySQL (around 2 million lines of code) within 1.5 hours. The overall false positive rate is also very low (14.3\% --- 23.6\%). Pinpoint has discovered over forty real bugs in mature and extensively checked open source systems. And the implementation of Pinpoint and all experimental results are freely available.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Zhu:2018:DDC, author = "He Zhu and Stephen Magill and Suresh Jagannathan", title = "A data-driven {CHC} solver", journal = j-SIGPLAN, volume = "53", number = "4", pages = "707--721", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192416", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a data-driven technique to solve Constrained Horn Clauses (CHCs) that encode verification conditions of programs containing unconstrained loops and recursions. Our CHC solver neither constrains the search space from which a predicate's components are inferred (e.g., by constraining the number of variables or the values of coefficients used to specify an invariant), nor fixes the shape of the predicate itself (e.g., by bounding the number and kind of logical connectives). Instead, our approach is based on a novel machine learning-inspired tool chain that synthesizes CHC solutions in terms of arbitrary Boolean combinations of unrestricted atomic predicates. 
A CEGAR-based verification loop inside the solver progressively samples representative positive and negative data from recursive CHCs, which is fed to the machine learning tool chain. Our solver is implemented as an LLVM pass in the SeaHorn verification framework and has been used to successfully verify a large number of nontrivial and challenging C programs from the literature and well-known benchmark suites (e.g., SV-COMP).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Raghothaman:2018:UGP, author = "Mukund Raghothaman and Sulekha Kulkarni and Kihong Heo and Mayur Naik", title = "User-guided program reasoning using {Bayesian} inference", journal = j-SIGPLAN, volume = "53", number = "4", pages = "722--735", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192417", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Program analyses necessarily make approximations that often lead them to report true alarms interspersed with many false alarms. We propose a new approach to leverage user feedback to guide program analyses towards true alarms and away from false alarms. Our approach associates each alarm with a confidence value by performing Bayesian inference on a probabilistic model derived from the analysis rules. In each iteration, the user inspects the alarm with the highest confidence and labels its ground truth, and the approach recomputes the confidences of the remaining alarms given this feedback. It thereby maximizes the return on the effort by the user in inspecting each alarm. We have implemented our approach in a tool named Bingo for program analyses expressed in Datalog. Experiments with real users and two sophisticated analyses---a static datarace analysis for Java programs and a static taint analysis for Android apps---show significant improvements on a range of metrics, including false alarm rates and number of bugs found.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Hong:2018:GCO, author = "Changwan Hong and Aravind Sukumaran-Rajam and Jinsung Kim and Prashant Singh Rawat and Sriram Krishnamoorthy and Louis-No{\"e}l Pouchet and Fabrice Rastello and P. Sadayappan", title = "{GPU} code optimization using abstract kernel emulation and sensitivity analysis", journal = j-SIGPLAN, volume = "53", number = "4", pages = "736--751", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192397", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we develop an approach to GPU kernel optimization by focusing on identification of bottleneck resources and determining optimization parameters that can alleviate the bottleneck. Performance modeling for GPUs is done by abstract kernel emulation along with latency/gap modeling of resources. Sensitivity analysis with respect to resource latency/gap parameters is used to predict the bottleneck resource for a given kernel's execution. 
The utility of the bottleneck analysis is demonstrated in two contexts: (1) Coupling the new bottleneck-driven optimization strategy with the OpenTuner auto-tuner: experimental results on all kernels from the Rodinia suite and GPU tensor contraction kernels from the NWChem computational chemistry suite demonstrate effectiveness. (2) Manual code optimization: two case studies illustrate the use of the bottleneck analysis to iteratively improve the performance of code from state-of-the-art domain-specific code generators.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Dathathri:2018:GCO, author = "Roshan Dathathri and Gurbinder Gill and Loc Hoang and Hoang-Vu Dang and Alex Brooks and Nikoli Dryden and Marc Snir and Keshav Pingali", title = "{Gluon}: a communication-optimizing substrate for distributed heterogeneous graph analytics", journal = j-SIGPLAN, volume = "53", number = "4", pages = "752--768", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192404", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper introduces a new approach to building distributed-memory graph analytics systems that exploits heterogeneity in processor types (CPU and GPU), partitioning policies, and programming models. The key to this approach is Gluon, a communication-optimizing substrate. Programmers write applications in a shared-memory programming system of their choice and interface these applications with Gluon using a lightweight API. Gluon enables these programs to run on heterogeneous clusters and optimizes communication in a novel way by exploiting structural and temporal invariants of graph partitioning policies. To demonstrate Gluon's ability to support different programming models, we interfaced Gluon with the Galois and Ligra shared-memory graph analytics systems to produce distributed-memory versions of these systems named D-Galois and D-Ligra, respectively. To demonstrate Gluon's ability to support heterogeneous processors, we interfaced Gluon with IrGL, a state-of-the-art single-GPU system for graph analytics, to produce D-IrGL, the first multi-GPU distributed-memory graph analytics system. Our experiments were done on CPU clusters with up to 256 hosts and roughly 70,000 threads and on multi-GPU clusters with up to 64 GPUs. The communication optimizations in Gluon improve end-to-end application execution time by ~2.6$ \times $ on the average. D-Galois and D-IrGL scale well and are faster than Gemini, the state-of-the-art distributed CPU graph analytics system, by factors of ~3.9$ \times $ and ~4.9$ \times $, respectively, on the average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Acar:2018:HSP, author = "Umut A. 
Acar and Arthur Chargu{\'e}raud and Adrien Guatto and Mike Rainey and Filip Sieczkowski", title = "Heartbeat scheduling: provable efficiency for nested parallelism", journal = j-SIGPLAN, volume = "53", number = "4", pages = "769--782", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192391", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A classic problem in parallel computing is to take a high-level parallel program written, for example, in nested-parallel style with fork-join constructs and run it efficiently on a real machine. The problem could be considered solved in theory, but not in practice, because the overheads of creating and managing parallel threads can overwhelm their benefits. Developing efficient parallel codes therefore usually requires extensive tuning and optimizations to reduce parallelism just to a point where the overheads become acceptable. In this paper, we present a scheduling technique that delivers provably efficient results for arbitrary nested-parallel programs, without the tuning needed for controlling parallelism overheads. The basic idea behind our technique is to create threads only at a beat (which we refer to as the ``heartbeat'') and make sure to do useful work in between. We specify our heartbeat scheduler using an abstract-machine semantics and provide mechanized proofs that the scheduler guarantees low overheads for all nested parallel programs. We present a prototype C++ implementation and an evaluation that shows that Heartbeat competes well with manually optimized Cilk Plus codes, without requiring manual tuning.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Serrano:2018:GIP, author = "Alejandro Serrano and Jurriaan Hage and Dimitrios Vytiniotis and Simon Peyton Jones", title = "Guarded impredicative polymorphism", journal = j-SIGPLAN, volume = "53", number = "4", pages = "783--796", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192389", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The design space for type systems that support impredicative instantiation is extremely complicated. One needs to strike a balance between expressiveness, simplicity for both the end programmer and the type system implementor, and how easily the system can be integrated with other advanced type system concepts. In this paper, we propose a new point in the design space, which we call guarded impredicativity. Its key idea is that impredicative instantiation in an application is allowed for type variables that occur under a type constructor. The resulting type system has a clean declarative specification --- making it easy for programmers to predict what will type and what will not ---, allows for a smooth integration with GHC's OutsideIn(X) constraint solving framework, while giving up very little in terms of expressiveness compared to systems like HMF, HML, FPH and MLF.
We give a sound and complete inference algorithm, and prove a principal type property for our system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Bowman:2018:TCC, author = "William J. Bowman and Amal Ahmed", title = "Typed closure conversion for the calculus of constructions", journal = j-SIGPLAN, volume = "53", number = "4", pages = "797--811", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192372", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dependently typed languages such as Coq are used to specify and verify the full functional correctness of source programs. Type-preserving compilation can be used to preserve these specifications and proofs of correctness through compilation into the generated target-language programs. Unfortunately, type-preserving compilation of dependent types is hard. In essence, the problem is that dependent type systems are designed around high-level compositional abstractions to decide type checking, but compilation interferes with the type-system rules for reasoning about run-time terms. We develop a type-preserving closure-conversion translation from the Calculus of Constructions (CC) with strong dependent pairs ($ \Sigma $ types) --- a subset of the core language of Coq --- to a type-safe, dependently typed compiler intermediate language named CC-CC. The central challenge in this work is how to translate the source type-system rules for reasoning about functions into target type-system rules for reasoning about closures. To justify these rules, we prove soundness of CC-CC by giving a model in CC. In addition to type preservation, we prove correctness of separate compilation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Pombrio:2018:ITR, author = "Justin Pombrio and Shriram Krishnamurthi", title = "Inferring type rules for syntactic sugar", journal = j-SIGPLAN, volume = "53", number = "4", pages = "812--825", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192398", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Type systems and syntactic sugar are both valuable to programmers, but sometimes at odds. While sugar is a valuable mechanism for implementing realistic languages, the expansion process obscures program source structure. As a result, type errors can reference terms the programmers did not write (and even constructs they do not know), baffling them. The language developer must also manually construct type rules for the sugars, to give a typed account of the surface language. We address these problems by presenting a process for automatically reconstructing type rules for the surface language using rules for the core. We have implemented this theory, and show several interesting case studies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "PLDI '18 proceedings.", } @Article{Byma:2018:DHP, author = "Stuart Byma and James R. 
Larus", title = "Detailed heap profiling", journal = j-SIGPLAN, volume = "53", number = "5", pages = "1--13", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210564", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern software systems heavily use the memory heap. As systems grow more complex and compute with increasing amounts of data, it can be difficult for developers to understand how their programs actually use the bytes that they allocate on the heap and whether improvements are possible. To answer this question of heap usage efficiency, we have built a new, detailed heap profiler called Memoro. Memoro uses a combination of static instrumentation, subroutine interception, and runtime data collection to build a clear picture of exactly when and where a program performs heap allocation, and crucially how it actually uses that memory. Memoro also introduces a new visualization application that can distill collected data into scores and visual cues that allow developers to quickly pinpoint and eliminate inefficient heap usage in their software. Our evaluation and experience with several applications demonstrates that Memoro can reduce heap usage and produce runtime improvements of 10\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Tripp:2018:FHP, author = "Charles Tripp and David Hyde and Benjamin Grossman-Ponemon", title = "{FRC}: a high-performance concurrent parallel deferred reference counter for {C++}", journal = j-SIGPLAN, volume = "53", number = "5", pages = "14--28", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210569", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present FRC, a high-performance concurrent parallel reference counter for unmanaged languages. It is well known that high-performance garbage collectors help developers write memory-safe, highly concurrent systems and data structures. While C++, C, and other unmanaged languages are used in high-performance applications, adding concurrent memory management to these languages has proven to be difficult. Unmanaged languages like C++ use pointers instead of references, and have uncooperative mutators which do not pause easily at a safe point. Thus, scanning mutator stack root references is challenging. FRC only defers decrements and does not require mutator threads to pause during collection. By deferring only decrements, FRC avoids much of the synchronization overhead of a fully-deferred implementation. Root references are scanned without interrupting the mutator by publishing these references to a thread-local array. FRC's performance can exceed that of the C++ standard library's shared pointer by orders of magnitude. FRC's thread-safety guarantees and low synchronization overhead enable significant throughput gains for concurrently-readable shared data structures. We describe the components of FRC, including our static tree router data structure: a novel barrier which improves the scalability of parallel collection workers. FRC's performance is evaluated on several concurrent data structures. 
We release FRC and our tests as open-source code and expect FRC will be useful for many concurrent C++ software systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Brandt:2018:DGC, author = "Steven R. Brandt and Hari Krishnan and Costas Busch and Gokarna Sharma", title = "Distributed garbage collection for general graphs", journal = j-SIGPLAN, volume = "53", number = "5", pages = "29--44", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210572", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose a scalable, cycle-collecting, decentralized, reference counting garbage collector with partial tracing. The algorithm is based on the Brownbridge system but uses four different types of references to label edges. Memory usage is $ O(\log n) $ bits per node, where $ n $ is the number of nodes in the graph. The algorithm assumes an asynchronous network model with a reliable reordering channel. It collects garbage in $ O(E_a) $ time, where $ E_a $ is the number of edges in the induced subgraph. The algorithm uses termination detection to manage the distributed computation, a unique identifier to break the symmetry among multiple collectors, and a transaction-based approach when multiple collectors conflict. Unlike existing algorithms, ours is not centralized, does not require barriers, does not require migration of nodes, does not require back-pointers on every edge, and is stable against concurrent mutation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Ismail:2018:HSC, author = "Mohamed Ismail and G. Edward Suh", title = "Hardware-software co-optimization of memory management in dynamic languages", journal = j-SIGPLAN, volume = "53", number = "5", pages = "45--58", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210566", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic programming languages are becoming increasingly popular, yet often show a significant performance slowdown compared to static languages. In this paper, we study the performance overhead of automatic memory management in dynamic languages. We propose to improve the performance and memory bandwidth usage of dynamic languages by co-optimizing garbage collection overhead and cache performance for newly-initialized and dead objects. Our study shows that less frequent garbage collection results in a large number of cache misses for initial stores to new objects. We solve this problem by directly placing uninitialized objects into on-chip caches without off-chip memory accesses. We further optimize the garbage collection by reducing unnecessary cache pollution and write-backs through partial tracing that invalidates dead objects between full garbage collections.
Experimental results on PyPy and V8 show that less frequent garbage collection along with our optimizations can significantly improve the performance of dynamic languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Bruno:2018:DVM, author = "Rodrigo Bruno and Paulo Ferreira and Ruslan Synytsky and Tetiana Fydorenchyk and Jia Rao and Hang Huang and Song Wu", title = "Dynamic vertical memory scalability for {OpenJDK} cloud applications", journal = j-SIGPLAN, volume = "53", number = "5", pages = "59--70", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210567", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The cloud is an increasingly popular platform to deploy applications as it lets cloud users provide resources to their applications as needed. Furthermore, cloud providers are now starting to offer a ``pay-as-you-use'' model in which users are only charged for the resources that are really used instead of paying for a statically sized instance. This new model allows cloud users to save money, and cloud providers to better utilize their hardware. However, applications running on top of runtime environments such as the Java Virtual Machine (JVM) cannot benefit from this new model because they cannot dynamically adapt the amount of used resources at runtime. In particular, if an application needs more memory than what was initially predicted at launch time, the JVM will not allow the application to grow its memory beyond the maximum value defined at launch time. In addition, the JVM will hold memory that is no longer being used by the application. This lack of dynamic vertical scalability completely prevents the benefits of the ``pay-as-you-use'' model, and forces users to over-provision resources, and to lose money on unused resources. We propose a new JVM heap sizing strategy that allows the JVM to dynamically scale its memory utilization according to the application's needs. First, we provide a configurable limit on how much the application can grow its memory. This limit is dynamic and can be changed at runtime, as opposed to the current static limit that can only be set at launch time. Second, we adapt current Garbage Collection policies that control how much the heap can grow and shrink to better fit what is currently being used by the application. The proposed solution is implemented in the OpenJDK 9 HotSpot JVM, the new release of OpenJDK. Changes were also introduced inside the Parallel Scavenge collector and the Garbage First collector (the new by-default collector in HotSpot). Evaluation experiments using real workloads and data show that, with negligible throughput and memory overhead, dynamic vertical memory scalability can be achieved.
This allows users to save significant amounts of money by not paying for unused resources, and cloud providers to better utilize their physical machines.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Kaur:2018:OCM, author = "Gurneet Kaur and Keval Vora and Sai Charan Koduru and Rajiv Gupta", title = "{OMR}: out-of-core {MapReduce} for large data sets", journal = j-SIGPLAN, volume = "53", number = "5", pages = "71--83", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210568", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "While single machine MapReduce systems can squeeze out maximum performance from available multi-cores, they are often limited by the size of main memory and can thus only process small datasets. Our experience shows that the state-of-the-art single-machine in-memory MapReduce system Metis frequently experiences out-of-memory crashes. Even though today's computers are equipped with efficient secondary storage devices, the frameworks do not utilize these devices mainly because disk access latencies are much higher than those for main memory. Therefore, the single-machine setup of the Hadoop system performs much slower when it is presented with the datasets which are larger than the main memory. Moreover, such frameworks also require tuning a lot of parameters which puts an added burden on the programmer. In this paper we present OMR, an Out-of-core MapReduce system that not only successfully handles datasets that are far larger than the size of main memory, it also guarantees linear scaling with the growing data sizes. OMR actively minimizes the amount of data to be read/written to/from disk via on-the-fly aggregation and it uses block sequential disk read/write operations whenever disk accesses become necessary to avoid running out of memory. We theoretically prove OMR's linear scalability and empirically demonstrate it by processing datasets that are up to 5x larger than main memory. Our experiments show that in comparison to the standalone single-machine setup of the Hadoop system, OMR delivers far higher performance. Also in contrast to Metis, OMR avoids out-of-memory crashes for large datasets as well as delivers higher performance when datasets are small enough to fit in main memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Byrne:2018:MMR, author = "Daniel Byrne and Nilufer Onder and Zhenlin Wang", title = "{mPart}: miss-ratio curve guided partitioning in key--value stores", journal = j-SIGPLAN, volume = "53", number = "5", pages = "84--95", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210571", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Web applications employ key-value stores to cache the data that is most commonly accessed. The cache improves a web application's performance by serving its requests from memory, avoiding fetching them from the backend database.
Since the memory space is limited, maximizing the memory utilization is a key to delivering the best performance possible. This has led to the use of multi-tenant systems, allowing applications to share cache space. In addition, application data access patterns change over time, so the system should be adaptive in its memory allocation. In this work, we address both multi-tenancy (where a single cache is used for multiple applications) and dynamic workloads (changing access patterns) using a model that relates the cache size to the application miss ratio, known as a miss ratio curve. Intuitively, the larger the cache, the less likely the system will need to fetch the data from the database. Our efficient, online construction of the miss ratio curve allows us to determine a near optimal memory allocation given the available system memory, while adapting to changing data access patterns. We show that our model outperforms an existing state-of-the-art sharing model, Memshare, in terms of overall cache hit ratio and does so at a lower time cost. We show that for a typical system, overall hit ratio is consistently 1 percentage point greater and 99.9th percentile latency is reduced by as much as 2.9\% under standard web application workloads containing millions of requests.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Brock:2018:PBS, author = "Jacob Brock and Chen Ding and Rahman Lavaee and Fangzhou Liu and Liang Yuan", title = "Prediction and bounds on shared cache demand from memory access interleaving", journal = j-SIGPLAN, volume = "53", number = "5", pages = "96--108", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210565", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Cache in multicore machines is often shared, and the cache performance depends on how memory accesses belonging to different programs interleave with one another. The full range of performance possibilities includes all possible interleavings, which are too numerous to be studied by experiments for any mix of non-trivial programs. This paper presents a theory to characterize the effect of memory access interleaving due to parallel execution of non-data-sharing programs. The theory uses an established metric called the footprint (which can be used to calculate miss ratios in fully-associative LRU caches) to measure cache demand, and considers the full range of interleaving possibilities. The paper proves a lower bound for footprints of interleaved traces, and then formulates an upper bound in terms of the footprints of the constituent traces.
It also shows the correctness of footprint composition used in a number of existing techniques, and places precise bounds on its accuracy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Horie:2018:BDQ, author = "Michihiro Horie and Hiroshi Horii and Kazunori Ogata and Tamiya Onodera", title = "Balanced double queues for {GC} work-stealing on weak memory models", journal = j-SIGPLAN, volume = "53", number = "5", pages = "109--119", month = may, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299706.3210570", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Work-stealing is promising for scheduling and balancing parallel workloads. It has a wide range of applicability on middleware, libraries, and runtime systems of programming languages. OpenJDK uses work-stealing for copying garbage collection (GC) to balance copying tasks among GC threads. Each thread has its own queue to store tasks. When a thread has no task in its queue, it acts as a thief and attempts to steal a task from another thread's queue. However, this work-stealing algorithm requires expensive memory fences for pushing, popping, and stealing tasks, especially on weak memory models such as POWER and ARM. To address this problem, we propose a work-stealing algorithm that uses double queues. Each GC thread has a public queue that is accessible from other GC threads and a private queue that is only accessible by itself. Pushing and popping tasks in the private queue are free from expensive memory fences. The most significant point in our algorithm is providing a mechanism to maintain the load balance on the basis of the use of double queues. We developed a prototype implementation for parallel GC in OpenJDK8 for ppc64le. We evaluated our algorithm by using SPECjbb2015, SPECjvm2008, TPC-DS, and Apache DayTrader.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "ISMM '18 proceedings.", } @Article{Santos:2018:MBD, author = "Rodrigo C. M. Santos and Guilherme F. Lima and Francisco Sant'Anna and Roberto Ierusalimschy and Edward H. Haeusler", title = "A memory-bounded, deterministic and terminating semantics for the synchronous programming language {C{\'e}u}", journal = j-SIGPLAN, volume = "53", number = "6", pages = "1--18", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211334", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "C{\'e}u is a synchronous programming language for embedded soft real-time systems. It focuses on control-flow safety features, such as safe shared-memory concurrency and safe abortion of lines of execution, while enforcing memory bounded, deterministic, and terminating reactions to the environment. 
In this work, we present a small-step structural operational semantics for C{\'e}u and a proof that reactions have the properties enumerated above: that for a given arbitrary timeline of input events, multiple executions of the same program always react in bounded time and arrive at the same final finite memory state.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Devine:2018:MCI, author = "James Devine and Joe Finney and Peli de Halleux and Micha{\l} Moskal and Thomas Ball and Steve Hodges", title = "{MakeCode} and {CODAL}: intuitive and efficient embedded systems programming for education", journal = j-SIGPLAN, volume = "53", number = "6", pages = "19--30", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211335", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Across the globe, it is now commonplace for educators to engage in the making (design and development) of embedded systems in the classroom to motivate and excite their students. This new domain brings its own set of unique requirements. Historically, embedded systems development requires knowledge of low-level programming languages, local installation of compilation toolchains, device drivers, and applications. For students and educators, these requirements can introduce insurmountable barriers. We present the motivation, requirements, implementation, and evaluation of a new programming platform that enables novice users to create software for embedded systems. The platform has two major components: (1) Microsoft MakeCode ( www.makecode.com ), a web app that encapsulates an entire beginner IDE for microcontrollers; and (2) CODAL, an efficient component-oriented C++ runtime for microcontrollers. We show how MakeCode and CODAL provide an accessible, cross-platform, installation-free programming experience for the BBC micro:bit and other embedded devices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Taylor:2018:ADL, author = "Ben Taylor and Vicent Sanz Marco and Willy Wolff and Yehia Elkhatib and Zheng Wang", title = "Adaptive deep learning model selection on embedded systems", journal = j-SIGPLAN, volume = "53", number = "6", pages = "31--43", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211336", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The recent ground-breaking advances in deep learning networks (DNNs) make them attractive for embedded systems. However, it can take a long time for DNNs to make an inference on resource-limited embedded devices. Offloading the computation into the cloud is often infeasible due to privacy concerns, high latency, or the lack of connectivity. As such, there is a critical need to find a way to effectively execute the DNN models locally on the devices. This paper presents an adaptive scheme to determine which DNN model to use for a given input, by considering the desired accuracy and inference time. 
Our approach employs machine learning to develop a predictive model to quickly select a pre-trained DNN to use for a given input and the optimization constraint. We achieve this by first training off-line a predictive model, and then use the learnt model to select a DNN model to use for new, unseen inputs. We apply our approach to the image classification task and evaluate it on a Jetson TX2 embedded deep learning platform using the ImageNet ILSVRC 2012 validation dataset. We consider a range of influential DNN models. Experimental results show that our approach achieves a 7.52\% improvement in inference accuracy, and a 1.8x reduction in inference time over the most-capable single DNN model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Han:2018:ORS, author = "Lei Han and Zhaoyan Shen and Zili Shao and Tao Li", title = "Optimizing {RAID\slash SSD} controllers with lifetime extension for flash-based {SSD} array", journal = j-SIGPLAN, volume = "53", number = "6", pages = "44--54", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211338", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Flash-based SSD RAID arrays are increasingly being deployed in data centers. Compared with HDD arrays, SSD arrays drastically enhance storage density and I/O performance, and reduce power and rack space. Nevertheless, SSDs suffer aging issues. Though prior studies have been conducted to address this disadvantage, effective techniques of RAID/SSD controllers are urgently needed to extend the lifetime of SSD arrays. In this paper, we for the first time apply approximate storage via the interplay of RAID and SSD controllers to optimize the lifespan of SSD arrays. Our basic idea is to reuse faulty blocks (those contain pages with uncorrectable errors) to store approximate data (which can tolerate more errors). By relaxing the integrity of flash blocks, we observed that the endurance of NAND flash memory can be significantly boosted, thereby providing huge potentials to significantly extend the lifetime of SSDs. Based on this observation, we propose the use of an efficient space management scheme for data allocation and FTL strategies by coordinating the interplay of RAID and SSD controllers to optimize the lifetime of SSD arrays. We implemented a prototype, called FreeRAID, based on an SSD array simulator. 
Our experiments show that we can significantly increase the lifetime by up to 2.17$ \times $ compared with conventional SSD-based RAID arrays.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Skelin:2018:CSA, author = "Mladen Skelin and Marc Geilen", title = "Compositionality in scenario-aware dataflow: a rendezvous perspective", journal = j-SIGPLAN, volume = "53", number = "6", pages = "55--64", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211339", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Finite-state machine-based scenario-aware dataflow (FSM-SADF) is a dynamic dataflow model of computation that combines streaming data and finite-state control. For the most part, it preserves the determinism of its underlying synchronous dataflow (SDF) concurrency model and only when necessary introduces the non-deterministic variation in terms of scenarios that are represented by SDF graphs. This puts FSM-SADF in a sweet spot in the trade-off space between expressiveness and analyzability. However, FSM-SADF supports no notion of compositionality, which hampers its usability in modeling and consequent analysis of large systems. In this work we propose a compositional semantics for FSM-SADF that overcomes this problem. We base the semantics of the composition on standard composition of processes with rendezvous communication in the style of CCS or CSP at the control level and the parallel, serial and feedback composition of SDF graphs at the dataflow level. We evaluate the approach on a case study from the multimedia domain.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Stokes:2018:DAG, author = "Michael Stokes and Ryan Baird and Zhaoxiang Jin and David Whalley and Soner Onder", title = "Decoupling address generation from loads and stores to improve data access energy efficiency", journal = j-SIGPLAN, volume = "53", number = "6", pages = "65--75", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211340", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Level-one data cache (L1 DC) accesses impact energy usage as they frequently occur and use significantly more energy than register file accesses. A memory access instruction consists of an address generation operation calculating the location where the data item resides in memory and the data access operation that loads/stores a value from/to that location. We propose to decouple these two operations into separate machine instructions to reduce energy usage. By associating the data translation lookaside buffer (DTLB) access and level-one data cache (L1 DC) tag check with an address generation instruction, only a single data array in a set-associative L1 DC needs to be accessed during a load instruction when the result of the tag check is known at that point. In addition, many DTLB accesses and L1 DC tag checks are avoided by memoizing the DTLB way and L1 DC way with the register that holds the memory address to be dereferenced. 
Finally, we are able to often coalesce an ALU operation with a load or store data access using our technique to reduce the number of instructions executed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Egger:2018:VCG, author = "Bernhard Egger and Eunjin Song and Hochan Lee and Daeyoung Shin", title = "Verification of coarse-grained reconfigurable arrays through random test programs", journal = j-SIGPLAN, volume = "53", number = "6", pages = "76--88", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211342", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We propose and evaluate a framework to test the functional correctness of coarse-grained reconfigurable array (CGRA) processors for pre-silicon verification and post-silicon validation. To reflect the reconfigurable nature of CGRAs, an architectural model of the system under test is built directly from the hardware description files. A guided place-and-routing algorithm is used to map operations and operands onto the heterogeneous processing elements (PE). Test coverage is maximized by favoring unexercised parts of the architecture. Requiring no explicit knowledge about the semantics of operations, the random test program generator (RTPG) framework seamlessly supports custom ISA extensions. The proposed framework is applied to the Samsung Reconfigurable Processor, a modulo-scheduled CGRA integrated in smartphones, cameras, printers, and smart TVs. Experiments demonstrate that the RTPG is versatile, efficient, and quickly achieves a high coverage. In addition to detecting all randomly inserted faults, the generated test programs also exposed two yet unknown actual faults in the architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Chang:2018:DNN, author = "Andre Xian Ming Chang and Aliasger Zaidy and Lukasz Burzawa and Eugenio Culurciello", title = "Deep neural networks compiler for a trace-based accelerator (short {WIP} paper)", journal = j-SIGPLAN, volume = "53", number = "6", pages = "89--93", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211333", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Deep Neural Networks (DNNs) are the algorithm of choice for image processing applications. DNNs present highly parallel workloads that lead to the emergence of custom hardware accelerators. Deep Learning (DL) models specialized in different tasks require a programmable custom hardware and a compiler/mapper to efficiently translate different DNNs into an efficient dataflow in the accelerator. The goal of this paper is to present a compiler for running DNNs on Snowflake, which is a programmable hardware accelerator that targets DNNs. The compiler correctly generates instructions for various DL models: AlexNet, VGG, ResNet and LightCNN9. Snowflake, with a varying number of processing units, was implemented on FPGA to measure the compiler and Snowflake performance properties upon scaling up. 
The system achieves 70 frames/s and 4.5 GB/s of off-chip memory bandwidth for AlexNet without linear layers on Xilinx's Zynq-SoC XC7Z045 FPGA.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{SantAnna:2018:TSL, author = "Francisco Sant'Anna and Alexandre Sztajnberg and Ana L{\'u}cia de Moura and Noemi Rodrigues", title = "Transparent standby for low-power, resource-constrained embedded systems: a programming language-based approach (short {WIP} paper)", journal = j-SIGPLAN, volume = "53", number = "6", pages = "94--98", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211337", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Standby efficiency for connected devices is one of the priorities of the G20's Energy Efficiency Action Plan. We propose transparent programming language mechanisms to enforce that applications remain in the deepest standby modes for the longest periods of time. We extend the programming language C{\'e}u with support for interrupt service routines and with a simple power management runtime. Based on these primitives, we also provide device drivers that allow applications to take advantage of standby automatically. Our approach relies on the synchronous semantics of the language which guarantees that reactions to the environment always reach an idle state amenable to standby. In addition, in order to lower the programming barrier of adoption, we show that programs in C{\'e}u can keep a sequential syntactic structure, even when applications require non-trivial concurrent behavior.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Chimdyalwar:2018:SRP, author = "Bharti Chimdyalwar and Priyanka Darke", title = "Statically relating program properties for efficient verification (short {WIP} paper)", journal = j-SIGPLAN, volume = "53", number = "6", pages = "99--103", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211341", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Efficient automatic verification of real world embedded software with numerous properties is a challenge. Existing techniques verify a sufficient subset of properties by identifying implication relations between their verification outcomes. We believe this is expensive and propose a novel complementary approach called grouping. Grouping does not consider the verification outcomes but uses data and control flow characteristics of the program to create disjoint groups of properties verifiable one group at a time. We present three grouping techniques, a framework, and experiments over open source and industrial applications to support our thesis. The experiments show a high gain in performance of a few state-of-the-art tools.
This led to the integration of grouping into the verification process of an automotive software manufacturer.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Chadha:2018:JAS, author = "Gaurav Chadha", title = "{JSCore}: architectural support for accelerating {JavaScript} execution (short {WIP} paper)", journal = j-SIGPLAN, volume = "53", number = "6", pages = "104--108", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211343", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "JavaScript has seen meteoric growth in popularity as it has increasingly become the language of choice for developers, both for front-end web development and server code development through various JavaScript frameworks and Node.js. Part of the reason for its wide use is that it is a prototype based language with dynamic types, making it easy to learn and program in. This flexibility and ease of programming comes at the cost of performance. There are two sources of significant slowdown. First, since the number and type of properties of prototypes is dynamic, accessing a property involves a slow dictionary lookup, as opposed to it being present at a fixed offset from the base address. Second, the dynamism in type of values necessitates wrapping and unwrapping of values into objects with a variety of checks including for type of the value. To mitigate these performance problems, this paper proposes JSCore, a core specialized for JavaScript execution, that vastly reduces the performance degradation due to the above two causes. It uses a hardware lookup table to accelerate property access, and extends the data path to store data types with the data, nearly eliminating the second source of slowdown. Combining the two, JSCore accelerates real world JavaScript applications by 23\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Mehrotra:2018:OSR, author = "Pavan Mehrotra and Sabar Dasgupta and Samantha Robertson and Paul Nuyujukian", title = "An open-source realtime computational platform (short {WIP} paper)", journal = j-SIGPLAN, volume = "53", number = "6", pages = "109--112", month = jun, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299710.3211344", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Systems neuroscience studies involving in-vivo models often require realtime data processing. In these studies, many events must be monitored and processed quickly, including behavior of the subject (e.g., movement of a limb) or features of neural data (e.g., a neuron transmitting an action potential). Unfortunately, most realtime platforms are proprietary, require specific architectures, or are limited to low-level programming languages. Here we present a hardware-independent, open-source realtime computation platform that supports high-level programming. 
The resulting platform, LiCoRICE, can process on order 10e10 bits/sec of network data at 1 ms ticks with 18.2 \micro s jitter. It connects to various inputs and outputs (e.g., DIO, Ethernet, database logging, and analog line in/out) and minimizes reliance on custom device drivers by leveraging peripheral support via the Linux kernel. Its modular architecture supports model-based design for rapid prototyping with C and Python/Cython and can perform numerical operations via BLAS/LAPACK-optimized NumPy that is statically compiled via Numba's pycc. LiCoRICE is not only suitable for systems neuroscience research, but also for applications requiring closed-loop realtime data processing from robotics and control systems to interactive applications and quantitative financial trading.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "LCTES '18 proceedings.", } @Article{Mista:2018:BPQ, author = "Agust{\'\i}n Mista and Alejandro Russo and John Hughes", title = "Branching processes for {QuickCheck} generators", journal = j-SIGPLAN, volume = "53", number = "7", pages = "1--13", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242747", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In QuickCheck (or, more generally, random testing), it is challenging to control random data generators' distributions---especially when it comes to user-defined algebraic data types (ADT). In this paper, we adapt results from an area of mathematics known as branching processes, and show how they help to analytically predict (at compile-time) the expected number of generated constructors, even in the presence of mutually recursive or composite ADTs. Using our probabilistic formulas, we design heuristics capable of automatically adjusting probabilities in order to synthesize generators whose distributions are aligned with users' demands. We provide a Haskell implementation of our mechanism in a tool called DRaGeN and perform case studies with real-world applications. When generating random values, our synthesized QuickCheck generators show improvements in code coverage when compared with those automatically derived by state-of-the-art tools.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Breitner:2018:PCP, author = "Joachim Breitner", title = "A promise checked is a promise kept: inspection testing", journal = j-SIGPLAN, volume = "53", number = "7", pages = "14--25", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242748", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Occasionally, developers need to ensure that the compiler treats their code in a specific way that is only visible by inspecting intermediate or final compilation artifacts. This is particularly common with carefully crafted compositional libraries, where certain usage patterns are expected to trigger an intricate sequence of compiler optimizations --- stream fusion is a well-known example.
The developer of such a library has to manually inspect build artifacts and check for the expected properties. Because this is too tedious to do often, it will likely go unnoticed if the property is broken by a change to the library code, its dependencies or the compiler. The lack of automation has led to released versions of such libraries breaking their documented promises. This indicates that there is an unrecognized need for a new testing paradigm, inspection testing, where the programmer declaratively describes non-functional properties of a compilation artifact and the compiler checks these properties. We define inspection testing abstractly, implement it in the context of the Haskell Compiler GHC and show that it increases the quality of such libraries.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Handley:2018:ACT, author = "Martin A. T. Handley and Graham Hutton", title = "{AutoBench}: comparing the time performance of {Haskell} programs", journal = j-SIGPLAN, volume = "53", number = "7", pages = "26--37", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242749", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Two fundamental goals in programming are correctness (producing the right results) and efficiency (using as few resources as possible). Property-based testing tools such as QuickCheck provide a lightweight means to check the correctness of Haskell programs, but what about their efficiency? In this article, we show how QuickCheck can be combined with the Criterion benchmarking library to give a lightweight means to compare the time performance of Haskell programs. We present the design and implementation of the AutoBench system, demonstrate its utility with a number of case studies, and find that many QuickCheck correctness properties are also efficiency improvements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Sun:2018:AMB, author = "Marilyn Sun and Kathleen Fisher", title = "{Autobahn 2.0}: minimizing bangs while maintaining performance (system demonstration)", journal = j-SIGPLAN, volume = "53", number = "7", pages = "38--40", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3264734", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Lazy evaluation has many advantages, but it can cause bad performance. Consequently, Haskell allows users to force eager evaluation at certain program points by inserting strictness annotations, known and written as bangs (!). Unfortunately, manual bang placement is difficult. Autobahn 1.0 uses a genetic algorithm to infer bang annotations that improve performance. However, Autobahn 1.0 often generates large numbers of superfluous bangs, which is problematic because users must inspect each such bang to determine whether it is safe. We introduce Autobahn 2.0, which uses GHC profiling information to reduce the number of superfluous bangs.
When evaluated on the NoFib benchmark suite, Autobahn 2.0 reduced the number of inferred bangs by 90.2\% on average, while only degrading program performance by 15.7\% compared with the performance produced by Autobahn 1.0. In a case study on a garbage collection simulator, Autobahn 2.0 eliminated 81.8\% of the recommended bangs, with the same 15.7\% optimization degradation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Serrano:2018:GPA, author = "Alejandro Serrano and Victor Cacciari Miraldo", title = "Generic programming of all kinds", journal = j-SIGPLAN, volume = "53", number = "7", pages = "41--54", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242745", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Datatype-generic programming is a widely used technique to define functions that work regularly over a class of datatypes. Examples include deriving serialization of data, equality or even functoriality. The state-of-the-art of generic programming still lacks support for GADTs, multiple type variables, and some other features. This paper exploits modern GHC extensions, including {\tt TypeInType}, to handle an arbitrary number of type variables, constraints, and existentials. We also provide an Agda model of our construction that does not require Russell's paradox, proving the construction is consistent.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Blondal:2018:DHT, author = "Baldur Bl{\"o}ndal and Andres L{\"o}h and Ryan Scott", title = "{Deriving Via}: or, how to turn hand-written instances into an anti-pattern", journal = j-SIGPLAN, volume = "53", number = "7", pages = "55--67", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242746", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Haskell's deriving construct is a cheap and cheerful way to quickly generate instances of type classes that follow common patterns. But at present, there is only a subset of such type class patterns that deriving supports, and if a particular class lies outside of this subset, then one cannot derive it at all, with no alternative except for laboriously declaring the instances by hand. To overcome this deficit, we introduce Deriving Via, an extension to deriving that enables programmers to compose instances from named programming patterns, thereby turning deriving into a high-level domain-specific language for defining instances.
Deriving Via leverages newtypes---an already familiar tool of the Haskell trade---to declare recurring patterns in a way that both feels natural and allows a high degree of abstraction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Martinez:2018:ITR, author = "Guido Mart{\'\i}nez and Mauro Jaskelioff and Guido {De Luca}", title = "Improving typeclass relations by being open", journal = j-SIGPLAN, volume = "53", number = "7", pages = "68--80", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242751", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Mathematical concepts such as monads, functors, monoids, and semigroups are expressed in Haskell as typeclasses. Therefore, in order to exploit relations such as ``every monad is a functor'', and ``every monoid is a semigroup'', we need to be able to also express relations between typeclasses. Currently, the only way to do so is using superclasses. However, superclasses can be problematic due to their closed nature. Adding a superclass implies modifying the subclass' definition, which is either impossible if one does not own such code, or painful as it requires cascading changes and the introduction of boilerplate throughout the codebase. In this article, we introduce class morphisms, a way to relate classes in an open fashion, without changing class definitions. We show how class morphisms improve the expressivity, conciseness, and maintainability of code. Further, we show how to implement them while maintaining canonicity and coherence, two key properties of the Haskell type system. Extending a typechecker with class morphisms amounts to adding an elaboration phase and is an unintrusive change. We back this claim with a prototype extension of GHC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Winant:2018:CED, author = "Thomas Winant and Dominique Devriese", title = "Coherent explicit dictionary application for {Haskell}", journal = j-SIGPLAN, volume = "53", number = "7", pages = "81--93", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242752", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Type classes are one of Haskell's most popular features and extend its type system with ad-hoc polymorphism. Since their conception, there were useful features that could not be offered because of the desire to offer two correctness properties: coherence and global uniqueness of instances. Coherence essentially guarantees that program semantics are independent from type-checker internals. Global uniqueness of instances is relied upon by libraries for enforcing, for example, that a single order relation is used for all manipulations of an ordered binary tree. The features that could not be offered include explicit dictionary application and local instances, which would be highly useful in practice. 
In this paper, we propose a new design for offering explicit dictionary application, without compromising coherence and global uniqueness. We introduce a novel criterion based on GHC's type argument roles to decide when a dictionary application is safe with respect to global uniqueness of instances. We preserve coherence by detecting potential sources of incoherence, and prove it formally. Moreover, our solution makes it possible to use local dictionaries. In addition to developing our ideas formally, we have implemented a working prototype in GHC.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Eisenberg:2018:TVP, author = "Richard A. Eisenberg and Joachim Breitner and Simon Peyton Jones", title = "Type variables in patterns", journal = j-SIGPLAN, volume = "53", number = "7", pages = "94--105", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242753", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "For many years, GHC has implemented an extension to Haskell that allows type variables to be bound in type signatures and patterns, and to scope over terms. This extension was never properly specified. We rectify that oversight here. With the formal specification in hand, the otherwise-labyrinthine path toward a design for binding type variables in patterns becomes blindingly clear. We thus extend ScopedTypeVariables to bind type variables explicitly, obviating the Proxy workaround to the dustbin of history.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Otwani:2018:TPY, author = "Divesh Otwani and Richard A. Eisenberg", title = "The {Thoralf} plugin: for your fancy type needs", journal = j-SIGPLAN, volume = "53", number = "7", pages = "106--118", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242754", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many fancy types (e.g., generalized algebraic data types, type families) require a type checker plugin. These fancy types have a type index (e.g., type level natural numbers) with an equality relation that is difficult or impossible to represent using GHC's built-in type equality. The most practical way to represent these equality relations is through a plugin that asserts equality constraints. However, such plugins are difficult to write and reason about. In this paper, we (1) present a formal theory of reasoning about the correctness of type checker plugins for type indices, and, (2) apply this theory in creating Thoralf, a generic and extensible plugin for type indices that translates GHC constraint problems to queries to an external SMT solver. 
By ``generic and extensible'', we mean the restrictions on extending Thoralf are slight, and, if some type index could be encoded as an SMT sort, then a programmer could extend Thoralf by providing this encoding function.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Noonan:2018:GDP, author = "Matt Noonan", title = "Ghosts of departed proofs (functional pearl)", journal = j-SIGPLAN, volume = "53", number = "7", pages = "119--131", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242755", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Library authors often are faced with a design choice: should a function with preconditions be implemented as a partial function, or by returning a failure condition on incorrect use? Neither option is ideal. Partial functions lead to frustrating run-time errors. Failure conditions must be checked at the use-site, placing an unfair tax on the users who have ensured that the function's preconditions were correctly met. In this paper, we introduce an API design concept called ``ghosts of departed proofs'' based on the following observation: sophisticated preconditions can be encoded in Haskell's type system with no run-time overhead, by using proofs that inhabit phantom type parameters attached to newtype wrappers. The user expresses correctness arguments by constructing proofs to inhabit these phantom types. Critically, this technique allows the library user to decide when and how to validate that the API's preconditions are met. The ``ghosts of departed proofs'' approach to API design can achieve many of the benefits of dependent types and refinement types, yet only requires some minor and well-understood extensions to Haskell 2010. We demonstrate the utility of this approach through a series of case studies, showing how to enforce novel invariants for lists, maps, graphs, shared memory regions, and more.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Vazou:2018:TPA, author = "Niki Vazou and Joachim Breitner and Rose Kunkel and David {Van Horn} and Graham Hutton", title = "Theorem proving for all: equational reasoning in liquid {Haskell} (functional pearl)", journal = j-SIGPLAN, volume = "53", number = "7", pages = "132--144", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242756", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Equational reasoning is one of the key features of pure functional languages such as Haskell. To date, however, such reasoning always took place externally to Haskell, either manually on paper, or mechanised in a theorem prover. This article shows how equational reasoning can be performed directly and seamlessly within Haskell itself, and be checked using Liquid Haskell. In particular, language learners --- to whom external theorem provers are out of reach --- can benefit from having their proofs mechanically checked. 
Concretely, we show how the equational proofs and derivations from Graham's textbook can be recast as proofs in Haskell (spoiler: they look essentially the same).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Barenz:2018:RFT, author = "Manuel B{\"a}renz and Ivan Perez", title = "{Rhine}: {FRP} with type-level clocks", journal = j-SIGPLAN, volume = "53", number = "7", pages = "145--157", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242757", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Processing data at different rates is generally a hard problem in reactive programming. Buffering problems, lags, and concurrency issues often occur. Many of these problems are clock errors, where data at different rates is combined incorrectly. Techniques to avoid clock errors, such as type-level clocks and deterministic scheduling, exist in the field of synchronous programming, but are not implemented in general-purpose languages like Haskell. Rhine is a clock-safe library for synchronous and asynchronous Functional Reactive Programming (FRP). It separates the aspects of clocking, scheduling and resampling from each other, and ensures clock-safety at the type level. Concurrent communication is encapsulated safely. Diverse reactive subsystems can be combined in a coherent, declarative data-flow framework, while correct interoperability of data at different rates is guaranteed by type-level clocks. This provides a general-purpose framework that simplifies multi-rate FRP systems and can be used for game development, media applications, GUIs and embedded systems, through a flexible API with many reusable components.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Matsuda:2018:EIL, author = "Kazutaka Matsuda and Meng Wang", title = "Embedding invertible languages with binders: a case of the {FliPpr} language", journal = j-SIGPLAN, volume = "53", number = "7", pages = "158--171", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242758", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper describes a new embedding technique of invertible programming languages, through the case of the FliPpr language. Embedded languages have the advantage of inheriting host languages' features and supports; and one of the influential methods of embedding is the tagless-final style, which enables a high level of programmability and extensibility. However, it is not straightforward to apply the method to the family of invertible/reversible/bidirectional languages, due to the different ways functions in such domains are represented. We consider FliPpr, an invertible pretty-printing system, as a representative of such languages, and show that Atkey et al.'s unembedding technique can be used to address the problem. Together with a reformulation of FliPpr, our embedding achieves a high level of interoperability with the host language Haskell, which is not found in any other invertible languages. 
We implement the idea and demonstrate the benefits of the approach with examples.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Han:2018:HPM, author = "Dong Han and Tao He", title = "A high-performance multicore {IO} manager based on {\tt libuv} (experience report)", journal = j-SIGPLAN, volume = "53", number = "7", pages = "172--178", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242759", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present a high performance multicore I/O manager based on libuv for the Glasgow Haskell Compiler (GHC). The new I/O manager is packaged as an ordinary Haskell package rather than baked into GHC's runtime system (GHC RTS), yet takes advantage of GHC RTS's comprehensive concurrent support, such as lightweight threads and safe/unsafe FFI options. The new I/O manager's performance is comparable with that of the existing implementation, with greater stability under high load. It can also be easily extended to support all of libuv's callback-based APIs, allowing us to write a complete high performance I/O toolkit without spending time on dealing with OS differences or low-level I/O system calls.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Gissurarson:2018:SVH, author = "Matth{\'\i}as P{\'a}ll Gissurarson", title = "Suggesting valid hole fits for typed-holes (experience report)", journal = j-SIGPLAN, volume = "53", number = "7", pages = "179--185", month = jul, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3299711.3242760", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:59 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Type systems allow programmers to communicate a partial specification of their program to the compiler using types, which can then be used to check that the implementation matches the specification. But can the types be used to aid programmers during development? In this experience report I describe the design and implementation of my lightweight and practical extension to the typed-holes of GHC that improves user experience by adding a list of valid hole fits and refinement hole fits to the error message of typed-holes.
By leveraging the type checker, these fits are selected from identifiers in scope such that if the hole is substituted with a valid hole fit, the resulting expression is guaranteed to type check.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", remark = "Haskell '18 proceedings.", } @Article{Wilson:2018:BGT, author = "Preston Tunnell Wilson and Ben Greenman and Justin Pombrio and Shriram Krishnamurthi", title = "The behavior of gradual types: a user study", journal = j-SIGPLAN, volume = "53", number = "8", pages = "1--12", month = oct, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393673.3276947", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393673.3276947", abstract = "There are several different gradual typing semantics, reflecting different trade-offs between performance and type soundness guarantees. Notably absent, however, are any data on which of these semantics developers actually prefer. We begin to rectify \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Bodin:2018:TMF, author = "Martin Bodin and Tom{\'a}s Diaz and {\'E}ric Tanter", title = "A trustworthy mechanized formalization of {R}", journal = j-SIGPLAN, volume = "53", number = "8", pages = "13--24", month = oct, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393673.3276946", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393673.3276946", abstract = "The R programming language is very popular for developing statistical software and data analysis, thanks to rich libraries, concise and expressive syntax, and support for interactive programming. Yet, the semantics of R is fairly complex, contains many subtle corner cases, and is not formally specified. This makes it difficult to reason about R programs. In this work, we develop a big-step operational semantics for R in the form of an interpreter written in the Coq proof assistant. We ensure the trustworthiness of the formalization by introducing a monadic encoding that allows the Coq interpreter, CoqR, to be in direct visual correspondence with the reference R interpreter, GNU R. Additionally, we provide a testing framework that supports systematic comparison of CoqR and GNU R. In its current state, CoqR covers the nucleus of the R language as well as numerous additional features, making it pass a significant number of realistic test cases from the GNU R and FastR projects. To exercise the formal specification, we prove in Coq the preservation of memory invariants in selected parts of the interpreter. 
This work is an important first step towards a robust environment for formal verification of R programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Marron:2018:LLC, author = "Mark Marron", title = "Log++ logging for a cloud-native world", journal = j-SIGPLAN, volume = "53", number = "8", pages = "25--36", month = oct, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393673.3276952", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393673.3276952", abstract = "Logging is a fundamental part of the software development and deployment lifecycle, but logging support is often provided as an afterthought via limited library APIs or third-party modules. Given the critical nature of logging in modern cloud, mobile, \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chen:2018:HBA, author = "Hanfeng Chen and Joseph Vinish D'Silva and Hongji Chen and Bettina Kemme and Laurie Hendren", title = "{HorseIR}: bringing array programming languages together with database query processing", journal = j-SIGPLAN, volume = "53", number = "8", pages = "37--49", month = oct, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393673.3276951", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393673.3276951", abstract = "Relational database management systems (RDBMS) are operationally similar to a dynamic language processor. They take SQL queries as input, dynamically generate an optimized execution plan, and then execute it. In recent decades, the emergence of in- \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Serrano:2018:JAC, author = "Manuel Serrano", title = "{JavaScript AOT} compilation", journal = j-SIGPLAN, volume = "53", number = "8", pages = "50--63", month = oct, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393673.3276950", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393673.3276950", abstract = "Static compilation, a.k.a. ahead-of-time (AOT) compilation, is an alternative approach to JIT compilation that can combine good speed and lightweight memory footprint, and that can accommodate read-only memory constraints that are imposed by some devices and some operating systems. Unfortunately, the highly dynamic nature of JavaScript makes it hard to compile statically, and all existing AOT compilers have given up on either good performance or full language support. We have designed and implemented an AOT compiler that aims at satisfying both.
It supports full unrestricted ECMAScript 5.1 plus many ECMAScript 2017 features and the majority of benchmarks are within 50\% of the performance of one of the fastest JIT compilers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Seginer:2018:QBO, author = "Yoav Seginer and Theo Vosse and Gil Harari and Uri Kolodny", title = "Query-based object-oriented programming: a declarative web of objects", journal = j-SIGPLAN, volume = "53", number = "8", pages = "64--75", month = oct, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393673.3276949", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393673.3276949", abstract = "We present a declarative, object-oriented language in which queries play a central role. Queries are used not only to access data, but also to refer to the application's object members and as a means of program control. The language is fully declarative,. \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Chari:2018:SCD, author = "Guido Chari and Javier Pim{\'a}s and Jan Vitek and Olivier Fl{\"u}ckiger", title = "Self-contained development environments", journal = j-SIGPLAN, volume = "53", number = "8", pages = "76--87", month = oct, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393673.3276948", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393673.3276948", abstract = "Operating systems are traditionally implemented in low- level, performance-oriented programming languages. These languages typically rely on minimal runtime support and provide unfettered access to the underlying hardware. Tradition has benefits: \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Herrera:2018:NCW, author = "David Herrera and Hanfeng Chen and Erick Lavoie and Laurie Hendren", title = "Numerical computing on the web: benchmarking for the future", journal = j-SIGPLAN, volume = "53", number = "8", pages = "88--100", month = oct, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393673.3276968", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393673.3276968", abstract = "Recent advances in execution environments for JavaScript and WebAssembly that run on a broad range of devices, from workstations and mobile phones to IoT devices, provide new opportunities for portable and web-based numerical computing. 
Indeed, numerous \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Smeltzer:2018:DSL, author = "Karl Smeltzer and Martin Erwig", title = "A domain-specific language for exploratory data visualization", journal = j-SIGPLAN, volume = "53", number = "9", pages = "1--13", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278138", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278138", abstract = "With an ever-growing amount of collected data, the importance of visualization as an analysis component is growing in concert. The creation of good visualizations often doesn't happen in one step but is rather an iterative and exploratory process. \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Stucki:2018:PUM, author = "Nicolas Stucki and Aggelos Biboudis and Martin Odersky", title = "A practical unification of multi-stage programming and macros", journal = j-SIGPLAN, volume = "53", number = "9", pages = "14--27", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278139", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278139", abstract = "Program generation is indispensable. We propose a novel unification of two existing metaprogramming techniques: multi-stage programming and hygienic generative macros. The former supports runtime code generation and execution in a type-safe manner while \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Hatch:2018:RRI, author = "William Gallard Hatch and Matthew Flatt", title = "{Rash}: from reckless interactions to reliable programs", journal = j-SIGPLAN, volume = "53", number = "9", pages = "28--39", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278129", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278129", abstract = "Command languages like the Bourne Shell provide a terse syntax for exploratory programming and system interaction. Shell users can begin to write programs that automate their tasks by simply copying their interactions verbatim into a script file. 
\ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Soares:2018:EFI, author = "Larissa Rocha Soares and Jens Meinicke and Sarah Nadi and Christian K{\"a}stner and Eduardo Santana de Almeida", title = "Exploring feature interactions without specifications: a controlled experiment", journal = j-SIGPLAN, volume = "53", number = "9", pages = "40--52", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278127", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278127", abstract = "In highly configurable systems, features may interact unexpectedly and produce faulty behavior. Those faults are not easily identified from the analysis of each feature separately, especially when feature specifications are missing. We propose \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Khalaj:2018:IOD, author = "Ebrahim Khalaj and Marwan Abi-Antoun", title = "Inferring ownership domains from refinements", journal = j-SIGPLAN, volume = "53", number = "9", pages = "53--65", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278128", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278128", abstract = "Ownership type qualifiers clarify aliasing invariants that cannot be directly expressed in mainstream programming languages. Adding qualifiers to code, however, often involves significant overhead and difficult interaction. We propose an analysis to \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Volanschi:2018:ISC, author = "Nic Volanschi and Bernard Serpette and Charles Consel", title = "Implementing a semi-causal domain-specific language for context detection over binary sensors", journal = j-SIGPLAN, volume = "53", number = "9", pages = "66--78", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278134", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278134", abstract = "In spite of the fact that many sensors in use today are binary (i.e. produce only values of 0 and 1), and that useful context-aware applications are built exclusively on top of them, there is currently no development approach specifically targeted to \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Susungi:2018:MPC, author = "Adilla Susungi and Norman A. 
Rink and Albert Cohen and Jeronimo Castrillon and Claude Tadonki", title = "Meta-programming for cross-domain tensor optimizations", journal = j-SIGPLAN, volume = "53", number = "9", pages = "79--92", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278131", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278131", abstract = "Many modern application domains crucially rely on tensor operations. The optimization of programs that operate on tensors poses difficulties that are not adequately addressed by existing languages and tools. Frameworks such as TensorFlow offer good \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Peldszus:2018:MBS, author = "Sven Peldszus and Daniel Str{\"u}ber and Jan J{\"u}rjens", title = "Model-based security analysis of feature-oriented software product lines", journal = j-SIGPLAN, volume = "53", number = "9", pages = "93--106", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278126", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278126", abstract = "Today's software systems are too complex to ensure security after the fact --- security has to be built into systems by design. To this end, model-based techniques such as UMLsec support the design-time specification and analysis of security requirements \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Christophe:2018:ODA, author = "Laurent Christophe and Coen {De Roover} and Elisa Gonzalez Boix and Wolfgang {De Meuter}", title = "Orchestrating dynamic analyses of distributed processes for full-stack {JavaScript} programs", journal = j-SIGPLAN, volume = "53", number = "9", pages = "107--118", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278135", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278135", abstract = "Dynamic analyses are commonly implemented by instrumenting the program under analysis. Examples of such analyses for JavaScript range from checkers of user- defined invariants to concolic testers. 
For a full-stack JavaScript program, these analyses \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Ruland:2018:MES, author = "Sebastian Ruland and Lars Luthmann and Johannes B{\"u}rdek and Sascha Lity and Thomas Th{\"u}m and Malte Lochau and M{\'a}rcio Ribeiro", title = "Measuring effectiveness of sample-based product-line testing", journal = j-SIGPLAN, volume = "53", number = "9", pages = "119--133", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278130", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278130", abstract = "Recent research on quality assurance (QA) of configurable software systems (e.g., software product lines) proposes different analysis strategies to cope with the inherent complexity caused by the well-known combinatorial-explosion problem. Those \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Zhang:2018:PMO, author = "Weixin Zhang and Bruno C. d. S. Oliveira", title = "Pattern matching in an open world", journal = j-SIGPLAN, volume = "53", number = "9", pages = "134--146", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278124", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278124", abstract = "Pattern matching is a pervasive and useful feature in functional programming. There have been many attempts to bring similar notions to Object-Oriented Programming (OOP) in the past. However, a key challenge in OOP is how pattern matching can coexist \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Al-Sibahi:2018:VHL, author = "Ahmad Salim Al-Sibahi and Thomas P. Jensen and Aleksandar S. Dimovski and Andrzej Wasowski", title = "Verification of high-level transformations with inductive refinement types", journal = j-SIGPLAN, volume = "53", number = "9", pages = "147--160", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278125", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278125", abstract = "High-level transformation languages like Rascal include expressive features for manipulating large abstract syntax trees: first-class traversals, expressive pattern matching, backtracking and generalized iterators. 
We present the design and \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Cunha:2018:ESS, author = "J{\'a}come Cunha and Mihai Dan and Martin Erwig and Danila Fedorin and Alex Grejuc", title = "Explaining spreadsheets with spreadsheets (short paper)", journal = j-SIGPLAN, volume = "53", number = "9", pages = "161--167", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278136", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278136", abstract = "Based on the concept of explanation sheets, we present an approach to make spreadsheets easier to understand and thus easier to use and maintain. We identify the notion of explanation soundness and show that explanation sheets which conform to simple \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{vanBinsbergen:2018:FHF, author = "L. Thomas van Binsbergen", title = "Funcons for {HGMP}: the fundamental constructs of homogeneous generative meta-programming (short paper)", journal = j-SIGPLAN, volume = "53", number = "9", pages = "168--174", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278132", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278132", abstract = "The PLanCompS project proposes a component-based approach to programming-language development in which fundamental constructs (funcons) are reused across language definitions. Homogeneous Generative Meta-Programming (HGMP) enables writing programs that \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Liu:2018:RTA, author = "Yin Liu and Kijin An and Eli Tilevich", title = "{RT-trust}: automated refactoring for trusted execution under real-time constraints", journal = j-SIGPLAN, volume = "53", number = "9", pages = "175--187", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278137", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278137", abstract = "Real-time systems must meet strict timeliness requirements. These systems also often need to protect their critical program information (CPI) from adversarial interference and intellectual property theft. 
Trusted execution environments (TEE) execute CPI \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Nieke:2018:AAF, author = "Michael Nieke and Jacopo Mauro and Christoph Seidl and Thomas Th{\"u}m and Ingrid Chieh Yu and Felix Franzke", title = "Anomaly analyses for feature-model evolution", journal = j-SIGPLAN, volume = "53", number = "9", pages = "188--201", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278123", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278123", abstract = "Software Product Lines (SPLs) are a common technique to capture families of software products in terms of commonalities and variabilities. On a conceptual level, functionality of an SPL is modeled in terms of features in Feature Models (FMs). As other \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", } @Article{Radanne:2018:RLG, author = "Gabriel Radanne and Peter Thiemann", title = "{Regenerate}: a language generator for extended regular expressions", journal = j-SIGPLAN, volume = "53", number = "9", pages = "202--214", month = nov, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3393934.3278133", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Apr 8 13:49:51 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3393934.3278133", abstract = "Regular expressions are part of every programmer's toolbox. They are used for a wide variety of language-related tasks and there are many algorithms for manipulating them. In particular, matching algorithms that detect whether a word belongs to the language described by a regular expression are well explored, yet new algorithms appear frequently. However, there is no satisfactory methodology for testing such matchers. We propose a testing methodology which is based on generating positive as well as negative examples of words in the language. To this end, we present a new algorithm to generate the language described by a generalized regular expression with intersection and complement operators. The complement operator allows us to generate both positive and negative example words from a given regular expression. We implement our generator in Haskell and OCaml and show that its performance is more than adequate for testing.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "https://dl.acm.org/loi/sigplan", }